## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 424365 22.7 885382 47.3 654177 35.0
## Vcells 835259 6.4 8388608 64.0 1840104 14.1
Here we the ABCD release 3.0 data-set
The following libraries and default settings were used during the analysis:
options(scipen = 999)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(eNetXplorer)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.0-2
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(pander)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ dials 0.0.9 ✓ rsample 0.0.8
## ✓ infer 0.5.3 ✓ tune 0.1.2
## ✓ modeldata 0.1.0 ✓ workflows 0.2.1
## ✓ parsnip 0.1.4 ✓ yardstick 0.0.7
## ✓ recipes 0.1.15
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x Matrix::expand() masks tidyr::expand()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x Matrix::pack() masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x Matrix::unpack() masks tidyr::unpack()
## x recipes::update() masks Matrix::update(), stats::update()
library("cowplot")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:eNetXplorer':
##
## export
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##parallel map
library("ggplot2")
library("furrr")
## Loading required package: future
library("party", quietly = TRUE)
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:tune':
##
## parameters
## The following object is masked from 'package:parsnip':
##
## fit
## The following object is masked from 'package:dials':
##
## parameters
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
library("permimp")
theme_set(theme_bw() + theme(panel.grid = element_blank()))
## parallel processing number of cores register
all_cores <- parallel::detectCores(logical = FALSE) - 2
We first loaded all of the relevant data files (not shown here as they refer to local directories):
The aim is to compute the following:
structural mri: cortical thickness and subcortical value
resting state mri: Gordon network correlations
stop signal task (SST): any stop vs correct go contrast
MID (monetary incentive): reward vs neutral contrast
Nback task mri scan: 2 back vs 0 back contrast
DTI scan: FA from Hagler atlas
###loading site and scanner information
MRIinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRI01_DATA_TABLE.csv")))
Siteinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_LT01_DATA_TABLE.csv")))
MriandSite <- left_join(MRIinfo,Siteinfo, by=c('SUBJECTKEY','EVENTNAME'))
MRIinfo %>% count(EVENTNAME,SEX)
## # A tibble: 4 x 3
## EVENTNAME SEX n
## <chr> <chr> <int>
## 1 2_year_follow_up_y_arm_1 F 2617
## 2 2_year_follow_up_y_arm_1 M 3076
## 3 baseline_year_1_arm_1 F 5631
## 4 baseline_year_1_arm_1 M 6161
###loading response variables
NIH_TB <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_TBSS01_DATA_TABLE.csv")))
LittleMan <-tibble::as_tibble(read.csv(paste0(datafolder,"LMTP201_DATA_TABLE.csv")))
Pearson <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_PS01_DATA_TABLE.csv")))
NBackBeh <- tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRINBACK02_DATA_TABLE.csv")) )
###loading brain scan data
smri_data <- tibble::as_tibble(read.csv( paste0(datafold,"sMRIDestAsegReadableGgsegQCedWeighted.csv")))
Nback_data <- tibble::as_tibble(read.csv(paste0(datafold, "NBackDestAsegReadableGgseg3dQCed.csv")))
MID_data <- tibble::as_tibble(read.csv(paste0(datafold, "MIDDestAsegReadableGgseg3dQCed.csv")))
SST_data <- tibble::as_tibble(read.csv(paste0(datafold, "SSTDestAsegReadableGgseg3d_QCed.csv")))
rsmri_data <- tibble::as_tibble(read.csv(paste0(datafold, "RSBetNetExcludedNoCOMBATABCD3.csv")))
DTI_data <- tibble::as_tibble(read.csv(paste0(datafold, "DTA_FA23Tracks.csv")))
### Note that we have not removed NA before Enet tuning to keep the most number of participants
short_names <- tibble::as_tibble(read.csv(paste0(scriptfold,"ShortNames_all.csv") ))
### load vision scan data, according to the documents some of the participants cannot see the IPad screen
vision_idx <- tibble::as_tibble(read.table(paste0(NDAfold, "abcd_svs01.txt"),header = TRUE))
site 01, 17, 19 have no DTI
MriandSite %>% select(SUBJECTKEY,EVENTNAME, SITE_ID_L) %>%
full_join(DTI_data, by = c('SUBJECTKEY','EVENTNAME')) %>%
select(FA_R_fornix, SITE_ID_L) %>%
group_by(SITE_ID_L) %>%
naniar::miss_var_summary() %>%
arrange(desc(pct_miss))
## # A tibble: 23 x 4
## # Groups: SITE_ID_L [23]
## SITE_ID_L variable n_miss pct_miss
## <chr> <chr> <int> <dbl>
## 1 site17 FA_R_fornix 890 100
## 2 site19 FA_R_fornix 840 100
## 3 site01 FA_R_fornix 562 99.8
## 4 site08 FA_R_fornix 113 22.6
## 5 site13 FA_R_fornix 233 21.7
## 6 site22 FA_R_fornix 7 20.6
## 7 site10 FA_R_fornix 203 17.7
## 8 site04 FA_R_fornix 209 17.4
## 9 site18 FA_R_fornix 85 15.3
## 10 site15 FA_R_fornix 100 15.2
## # … with 13 more rows
We 1) kept the within-network z-transformed zero correlations 2) took the means of between-network z-transformed zero correlations 3) converted these Fisher z to r and rearranged them into a correlation matrix 4) transformed this matrix to “partial” correlations with cor2pcor 5) converted them back to Fisher z
library("corpcor")
ROI_names <- c("Auditory","CinguloOpercular","CinguloParietal","Default","DorsalAttention",
"FrontoParietal","None","RetrosplenialTemporal","SensorimotorHand","SensorimotorMouth",
"Salience","VentralAttention")
stepwise_sum <- function(i){
return(sum(diff_seq[1:i]))
}
diff_seq = seq(from=11, to=1, by=-1)
diff_seq2=seq(from=11,to=2)+1
dim_vector <- dim(select(rsmri_data, starts_with("avg")))[2]
dim_matrix <- (-(-1)+sqrt(1-4*1*2*(-dim_vector)))/2
step_seq <- map_dbl(.x=seq(from=1, to=11,by =1),.f=stepwise_sum)
upp_seq<- append(12,step_seq+12)
low_seq <- append(1,map_dbl(.x=seq(from=1,to=10, by=1),.f=function(.x){
return(sum(diff_seq2[1:.x])+1)
}))%>%append(.,78)
partialcor <- function(data_split){
cor_RS <- select(data_split, starts_with("avg"))
cor_RS<-DescTools::FisherZInv(cor_RS[,order(names(cor_RS))])
avg_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
Par_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
row_to_matrix <- function(j){
target=matrix(rep(1,dim_matrix^2),nrow=dim_matrix)
for(i in 1:length(low_seq)){
target[(i+1):13,i]=target[i,(i+1):13]=as.numeric(cor_RS[j,low_seq[i]:upp_seq[i]])
}
target[12,13]=target[13,12]=as.numeric(cor_RS[j,upp_seq[i]])
return(target)
}
avg_cor_array <-map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),.f=row_to_matrix)
Par_cor_array <- avg_cor_array %>% map(.,cor2pcor)
matrix_to_row <- function(j){
target_vec<-rep(NA,dim(cor_RS)[2])
for(i in 1:length(upp_seq)){
target_vec[low_seq[i]:upp_seq[i]]<-as.vector(Par_cor_array[[j]][i,(i+1):13])
}
return(DescTools::FisherZ(target_vec))
}
Par_cor_RS_list <- map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),
.f=matrix_to_row)
Par_cor_RS_df <- as.data.frame(do.call(rbind,Par_cor_RS_list))
colnames(Par_cor_RS_df)<-names(cor_RS)
Par_cor_RS_df <- Par_cor_RS_df%>%
rename_at(.vars = vars(contains("avg")),
.funs = funs(sub("avg", "par", .)))%>%
mutate(SUBJECTKEY =data_split$SUBJECTKEY,
EVENTNAME=data_split$EVENTNAME)
data_return <- left_join(Par_cor_RS_df,
data_split,
by = c("SUBJECTKEY","EVENTNAME"))%>%
drop_na()
return(data_return)
}
rsmri_par_data <- partialcor(data_split = rsmri_data)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gplots
vision_idx <- vision_idx[-1,]%>% mutate(SUBJECTKEY=subjectkey)%>%
mutate(EVENTNAME=eventname)
data_all <- plyr::join_all(list(smri_data,
MID_data,
rsmri_par_data,
Nback_data,
SST_data,DTI_data,
NBackBeh,
Pearson,
LittleMan,
NIH_TB,
MriandSite,
vision_idx),
by=c('SUBJECTKEY','EVENTNAME'), type='full')
### full join all the data sets to keep all the participants after QC. Actually smri have the most number of participants
### there are only two participants that are not in the smri data set but in other modalities.
### change integer response variables into double for later scaling and IQR
data_all$PEA_RAVLT_LD_TRIAL_VII_TC <- as.double(data_all$PEA_RAVLT_LD_TRIAL_VII_TC)
data_all$PEA_WISCV_TRS <- as.double(data_all$PEA_WISCV_TRS)
54 subjects
data_removed <- data_all %>% filter(snellen_va_y == 0 | snellen_va_y == 1 | vis_flg == 2)
removed_subj <- data_removed$SUBJECTKEY
data_all <- data_all %>% filter(!SUBJECTKEY %in% removed_subj)
length(removed_subj)
## [1] 54
mri_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
Resp_Var <- c('TFMRI_NB_ALL_BEH_C2B_RATE',
"NIHTBX_PICVOCAB_UNCORRECTED",
"NIHTBX_FLANKER_UNCORRECTED",
"NIHTBX_PATTERN_UNCORRECTED",
"NIHTBX_PICTURE_UNCORRECTED",
"NIHTBX_READING_UNCORRECTED",
"LMT_SCR_PERC_CORRECT",
"PEA_RAVLT_LD_TRIAL_VII_TC",
"TFMRI_SST_ALL_BEH_TOTAL_ISSRT" )
resp_var_plotting_long <- c("2-back working memory",
"Picture vocabulary",
"Flanker",
"Pattern comparison processing speed",
"Picture sequence memory",
"Oral reading recognition",
"Little man task correct percentage",
"RAVLT long delay trial VII total correct",
"Stop Signal Reaction Time (integration)"
)
resp_var_plotting_short <- c("2-back Work Mem",
"Pic Vocab","Flanker",
"Pattern Speed",
"Seq Memory",
"Reading Recog",
"Little Man",
"Audi Verbal",
"integration SSRT")
resp_var_plotting <- tibble("response" =Resp_Var,
"longer_name"=resp_var_plotting_long,
"short_name"=resp_var_plotting_short)
subj_info <- c('SUBJECTKEY', 'MRI_INFO_DEVICESERIALNUMBER', 'SITE_ID_L')
resp_names <- data_all %>% select(all_of(Resp_Var))%>%
names()%>%
set_names()
new_shorter_names <- short_names
new_shorter_names$roiShort[97] <- "R Subcentral"
new_shorter_names$roiShort <- str_squish(string = new_shorter_names$roiShort)
IQR and combat (to adjust batch effects for sMRI, rsfMRI and DTI)
# IQR function to remove data with 3 IQR rule
IQR_remove <- function(data_split){
data_split%>%
mutate_at(vars(- all_of(subj_info)), ~ ifelse(
.x > quantile(.x, na.rm = TRUE)[4] + 3 * IQR(.x, na.rm = TRUE) |
.x < quantile(.x, na.rm = TRUE)[2] - 3 * IQR(.x, na.rm = TRUE),
NA, .x)) %>%
drop_na() %>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
}
library("sva")
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:yardstick':
##
## sens, spec
## The following object is masked from 'package:readr':
##
## spec
## Loading required package: BiocParallel
batch_adjust <- function(data_fold,resp, x,y){
ols_data <- select(data_fold, resp,starts_with(x), starts_with(y))
ols_matrix <- as.matrix(t(ols_data))
dimnames(ols_matrix)<- NULL
data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
ols_data_com <- data.frame(t(ols_matrix_com))
colnames(ols_data_com) <- colnames(ols_data)
return(ols_data_com)
}
# if only one type of data is adjusted (e.g., DTI only requires FA but sMRI has two corticol and sub, rs has two within and bet networks)
batch_adjustOne <- function(data_fold,resp, x){
ols_data <- select(data_fold, resp,starts_with(x))
ols_matrix <- as.matrix(t(ols_data))
dimnames(ols_matrix)<- NULL
data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
ols_data_com <- data.frame(t(ols_matrix_com))
colnames(ols_data_com) <- colnames(ols_data)
return(ols_data_com)
}
## look at the contrast of MID and SST
contrast_vec_MID <- MID_data %>%
select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
colnames() %>%
str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")
contrast_vec_SST <- SST_data %>%
select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
colnames() %>%
str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")
## extract the contrasts for analysis
data_analysis <- data_all %>%
select(starts_with("X2backvs0back_ROI_"),
starts_with("par"),
starts_with("Within"),
all_of(Resp_Var),
all_of(subj_info),
starts_with("antiRewVsNeu" ),
starts_with("anystopvscorrectgo" ),
starts_with("ASEG"),
starts_with("Dest_Thick"))
note 2nd time point doesn’t have “NIHTBX_LIST_UNCORRECTED” “NIHTBX_CARDSORT_UNCORRECTED” “PEA_WISCV_TRS”
plotRes <- Resp_Var %>% map(.,~ggplot(data = data_all,
mapping = aes_string(x = data_all[[.]], color = "EVENTNAME")) +
geom_density() +
labs(x= resp_var_plotting$short_name[which(resp_var_plotting$response==.)]) +
labs(y= "") +
scale_color_discrete(name ="",
breaks=c("2_year_follow_up_y_arm_1", "baseline_year_1_arm_1"),
labels=c("Follow Up", "Baseline")) +
theme_classic())
ggpubr::ggarrange(plotlist=plotRes, common.legend = TRUE)
## Warning: Removed 3040 rows containing non-finite values (stat_density).
## Warning: Removed 3040 rows containing non-finite values (stat_density).
## Warning: Removed 428 rows containing non-finite values (stat_density).
## Warning: Removed 320 rows containing non-finite values (stat_density).
## Warning: Removed 371 rows containing non-finite values (stat_density).
## Warning: Removed 343 rows containing non-finite values (stat_density).
## Warning: Removed 455 rows containing non-finite values (stat_density).
## Warning: Removed 410 rows containing non-finite values (stat_density).
## Warning: Removed 474 rows containing non-finite values (stat_density).
## Warning: Removed 6659 rows containing non-finite values (stat_density).
First we removed sites that have fewer than 100 participants. But none has fewer than 100.
The data structure of the analysis is comprised of four parts:
1) training set (1st layer training set): predicting cognition from each modality via the elastic net,
validation set (2nd layer training set): combining predicted values from all modalities via the random forest,
test set of the baseline period,
test set of the second year follow up.
To protect data leakage, we select the subjects with the same subject key for the baseline test set and the followup test set.
site_count <- data_analysis %>% count(SITE_ID_L)
baseline_data <- data_all %>% filter(EVENTNAME == "baseline_year_1_arm_1")
followup_data <- data_all %>% filter(EVENTNAME == "2_year_follow_up_y_arm_1")
## all sites have more than 100!
baseline_analysis <- baseline_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])
followup_analysis <- followup_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])
subj_diff <- setdiff(baseline_analysis$SUBJECTKEY,followup_analysis$SUBJECTKEY)
baseline_trainvalid <- baseline_analysis %>% subset(.,SUBJECTKEY %in% subj_diff)
baseline_test <- baseline_analysis %>% subset(.,SUBJECTKEY %in% followup_analysis$SUBJECTKEY)
set.seed(123456)
### divide the training dataset into two
### one for enet tuning and one for random forest tuning
baseline_two_fold <-vfold_cv(data =baseline_trainvalid, v=2,repeats = 1)
# training set (1st layer training set): predicting cognition from each modality via the elastic net,
dim(analysis(baseline_two_fold$splits[[1]]))[1]
## [1] 3041
# validation set (2nd layer training set): combining predicted values from all modalities via the random forest,
dim(assessment(baseline_two_fold$splits[[1]]))[1]
## [1] 3042
# test set of the baseline period,
dim(baseline_test)[1]
## [1] 5622
# test set of the second year follow up
dim(followup_analysis)[1]
## [1] 5656
Here we used Combat, an empirical Bayesian method, to adjust for batch effects (separately for the train and test). Because Nback, SST and MID task-based fMRI are contrast data, batch effects are negligible. And accordingly, we decided not to apply combat on the task-based fMRI, and only applied it on the resting state mri, structure mri and DTI. https://www.biorxiv.org/content/10.1101/309260v1.full.pdf
# for models that require no Combat
enet_processing_nocom <-
function(resp_var,
split_train = analysis(baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test = baseline_test,followup_test,contrast_name)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
baseline_test_set <- split_test %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
followup_test_set <- followup_test %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
ols_data_train <- select(data_train, resp_var, starts_with(contrast_name))
ols_data_valid <- select(data_valid,resp_var,starts_with(contrast_name))
ols_test_baseline <- select(baseline_test_set,resp_var,starts_with(contrast_name))
ols_test_followup <- select(followup_test_set,resp_var,starts_with(contrast_name))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
# for models that require Combat
enet_processing_com <-
function(resp_var,
split_train = analysis( baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test=baseline_test,followup_test, measure_one,measure_two)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two)) %>%
drop_na()%>%
IQR_remove()
baseline_test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>%
IQR_remove()
followup_test_set <- followup_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>%
IQR_remove()
ols_data_train <-
batch_adjust(data_fold = data_train,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_data_valid <-
batch_adjust(data_fold = data_valid,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_test_baseline <-
batch_adjust(data_fold = baseline_test_set,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_test_followup <-
batch_adjust(data_fold = followup_test_set,
resp = resp_var,
x=measure_one,
y=measure_two)
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_processing_comOne <-
function(resp_var,
split_train = analysis( baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test=baseline_test,followup_test, measure_one)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one)) %>%
drop_na()%>%
IQR_remove()
baseline_test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
followup_test_set <- followup_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
ols_data_train <-
batch_adjustOne(data_fold = data_train,
resp = resp_var,
x=measure_one)
ols_data_valid <-
batch_adjustOne(data_fold = data_valid,
resp = resp_var,
x=measure_one)
ols_test_baseline <-
batch_adjustOne(data_fold = baseline_test_set,
resp = resp_var,
x=measure_one)
ols_test_followup <-
batch_adjustOne(data_fold = followup_test_set,
resp = resp_var,
x=measure_one)
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
this function will fitting enet models and to apply them the 2nd layer model and to the test sets
enet_tuning <- function(resp_var, data_list){
#recipe train and test data are from ols as the variables are the same
norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")),
data =data_list[["ols_data_train"]] ) %>%
step_dummy(all_nominal()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
# estimate the means and standard deviations
prep(training = data_list[["ols_data_train"]], retain = TRUE)
## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>%
set_engine("glmnet")
set.seed(123456)
# 10fold split within the training data
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],
v = 10, repeats = 1)
glmn_wfl <- workflow() %>%
add_recipe(norm_recipe) %>%
add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 200 levels of lambda and 11 levels of alpha (0, .1 to 1)
glmn_grid <- grid_regular(glmn_set, levels = c(200,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <- tune_grid(glmn_wfl,
resamples = all_rs,
grid = glmn_grid,
metrics = metric_set(mae),
control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tunes parameters to fit the test data
glmn_wfl_final <-
glmn_wfl %>%
finalize_workflow(best_glmn) %>%
parsnip::fit(data = data_list[["ols_data_train"]]) ##clearly another package is blocking this function
### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
###this is the prediction for test set
pred_test_baseline <- predict(glmn_wfl_final, new_data =data_list[["ols_test_baseline"]])
pred_test_followup <- predict(glmn_wfl_final, new_data =data_list[["ols_test_followup"]])
##compute the correlation between predicted results and the test data
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_baseline <- cor(pred_test_baseline$.pred,data_list[["ols_test_baseline"]][[resp_var]])
performance_followup <- cor(pred_test_followup$.pred,data_list[["ols_test_followup"]][[resp_var]])
best_glmn <- best_glmn %>%
mutate(performance=performance_glmn,
baseline_performance = performance_baseline,
followup_performance=performance_followup)
return(list("param_tune"=best_glmn,
"predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]]$SUBJECTKEY,
"pred"=pred_glmn$.pred),
"predicted_baseline"=tibble(SUBJECTKEY=data_list[["baseline_test_id"]]$SUBJECTKEY,
"pred"=pred_test_baseline$.pred),
"predicted_followup"=tibble(SUBJECTKEY=data_list[["followup_test_id"]]$SUBJECTKEY,
"pred"=pred_test_followup$.pred),
"obs_train"=length(data_list[["data_train_id"]]$SUBJECTKEY),
"obs_valid"=length(data_list[["data_valid_id"]]$SUBJECTKEY)))
}
Elastic net tuning is saved together with random forest results.
Nback_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,contrast_name = "X2backvs0back_ROI_",followup_test = followup_analysis))
##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,
followup_test = followup_analysis,
contrast_name = "anystopvscorrectgo_ROI_"))
MID_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,
followup_test = followup_analysis,
contrast_name = "antiRewVsNeu_ROI_"))
DTI_enet_data <- resp_names %>%
map(.,~enet_processing_comOne(resp_var = .,
followup_test = followup_analysis,
measure_one = "FA_"))
smri_enet_data <- resp_names %>%
map(.,~enet_processing_com(resp_var = .,
followup_test = followup_analysis,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
rsmri_enet_data <- resp_names %>%
map(.,~enet_processing_com(resp_var = .,
followup_test = followup_analysis,
measure_one = "par",
measure_two = "Within"))
enet_data_all <- list("Nback"=Nback_enet_data,
"SST"=SST_enet_data,
"MID"=MID_enet_data,
"rsmri"=rsmri_enet_data,
"smri"=smri_enet_data,
"DTI"=DTI_enet_data)
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
Nback_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = Nback_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = SST_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
MID_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = MID_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
DTI_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = DTI_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
smri_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = smri_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
rsmri_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = rsmri_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
enet_results_all <- list("Nback_results"=Nback_results,
"SST_results"=SST_results,
"MID_results"=MID_results,
"rsmri_results"=rsmri_results,
"smri_results"=smri_results,
"DTI_results"=DTI_results)
modality_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
modality_results <- paste0(modality_vec,"_results")
datasplits_number = function(data_list, resp_vec, resp_plotting){
number_datasplits <- modality_results %>% map(.,function(modality_resp=.){
data_input <- data_list[[modality_resp]]
one_modality <- resp_vec %>% map(.,function(resp=.){
one_allset <- tibble(enet_train = data_input[[resp]][["obs_train"]],
stack_train=data_input[[resp]][["obs_valid"]],
baseline=dim(data_input[[resp]][["predicted_baseline"]])[1],
followup=dim(data_input[[resp]][["predicted_followup"]])[1],
response=resp)
return(one_allset)
})
})
names(number_datasplits)<- modality_results
number_datasplits <- number_datasplits %>% map(.,~do.call(rbind,.))
number_datasplits <- number_datasplits %>% map(.,~left_join(.,resp_plotting,by="response"))
modality_results %>% map(.,~ mutate(number_datasplits[[.]],
modality=str_remove_all(.,"_results"))%>%
select(modality,
enet_train,
stack_train,
baseline,
followup,
short_name)%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") # seems to have problem knitting
)
}
datasplits_number(data_list = enet_results_all,resp_vec = resp_names,resp_plotting = resp_var_plotting)
[[1]]
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| Nback | 1417 | 1385 | 2734 | 3183 | 2-back Work Mem |
| Nback | 1397 | 1364 | 2719 | 3078 | Pic Vocab |
| Nback | 1398 | 1363 | 2709 | 3131 | Flanker |
| Nback | 1397 | 1361 | 2708 | 3118 | Pattern Speed |
| Nback | 1397 | 1364 | 2717 | 3129 | Seq Memory |
| Nback | 1392 | 1364 | 2712 | 3073 | Reading Recog |
| Nback | 1395 | 1361 | 2642 | 3175 | Little Man |
| Nback | 1392 | 1351 | 2682 | 3123 | Audi Verbal |
| Nback | 1162 | 1148 | 2285 | 2738 | integration SSRT |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| SST | 1403 | 1428 | 2755 | 3001 | 2-back Work Mem |
| SST | 1447 | 1458 | 2854 | 2987 | Pic Vocab |
| SST | 1447 | 1452 | 2848 | 3039 | Flanker |
| SST | 1445 | 1452 | 2848 | 3028 | Pattern Speed |
| SST | 1447 | 1456 | 2850 | 3040 | Seq Memory |
| SST | 1439 | 1458 | 2846 | 2983 | Reading Recog |
| SST | 1440 | 1452 | 2772 | 3085 | Little Man |
| SST | 1444 | 1446 | 2819 | 3022 | Audi Verbal |
| SST | 1469 | 1480 | 2872 | 3088 | integration SSRT |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| MID | 1599 | 1594 | 3066 | 3221 | 2-back Work Mem |
| MID | 1647 | 1635 | 3213 | 3222 | Pic Vocab |
| MID | 1646 | 1628 | 3210 | 3281 | Flanker |
| MID | 1647 | 1630 | 3206 | 3270 | Pattern Speed |
| MID | 1651 | 1635 | 3211 | 3279 | Seq Memory |
| MID | 1643 | 1634 | 3205 | 3215 | Reading Recog |
| MID | 1641 | 1631 | 3132 | 3326 | Little Man |
| MID | 1639 | 1621 | 3175 | 3261 | Audi Verbal |
| MID | 1295 | 1323 | 2579 | 2771 | integration SSRT |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| rsmri | 1871 | 1900 | 3556 | 4051 | 2-back Work Mem |
| rsmri | 2041 | 2056 | 3862 | 4270 | Pic Vocab |
| rsmri | 2037 | 2045 | 3858 | 4350 | Flanker |
| rsmri | 2039 | 2052 | 3853 | 4331 | Pattern Speed |
| rsmri | 2043 | 2054 | 3860 | 4347 | Seq Memory |
| rsmri | 2041 | 2054 | 3846 | 4258 | Reading Recog |
| rsmri | 2034 | 2054 | 3756 | 4417 | Little Man |
| rsmri | 2034 | 2041 | 3823 | 4327 | Audi Verbal |
| rsmri | 1503 | 1535 | 2970 | 3438 | integration SSRT |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| smri | 2133 | 2178 | 4028 | 4380 | 2-back Work Mem |
| smri | 2467 | 2484 | 4622 | 4724 | Pic Vocab |
| smri | 2465 | 2484 | 4619 | 4800 | Flanker |
| smri | 2464 | 2478 | 4611 | 4785 | Pattern Speed |
| smri | 2467 | 2485 | 4619 | 4805 | Seq Memory |
| smri | 2455 | 2482 | 4608 | 4712 | Reading Recog |
| smri | 2464 | 2488 | 4498 | 4881 | Little Man |
| smri | 2450 | 2472 | 4571 | 4775 | Audi Verbal |
| smri | 1661 | 1706 | 3292 | 3624 | integration SSRT |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| DTI | 2104 | 2111 | 3742 | 4117 | 2-back Work Mem |
| DTI | 2324 | 2312 | 4159 | 4369 | Pic Vocab |
| DTI | 2321 | 2313 | 4154 | 4439 | Flanker |
| DTI | 2322 | 2308 | 4151 | 4428 | Pattern Speed |
| DTI | 2326 | 2312 | 4155 | 4439 | Seq Memory |
| DTI | 2324 | 2312 | 4143 | 4357 | Reading Recog |
| DTI | 2317 | 2311 | 4054 | 4510 | Little Man |
| DTI | 2320 | 2301 | 4126 | 4412 | Audi Verbal |
| DTI | 1637 | 1650 | 3051 | 3424 | integration SSRT |
enetXplorer allows us to use permutation to test the significance of each feature
### NOT RUN by default, takes long time
enet_results_xplorer <- enet_results_all
names(enet_results_xplorer) <- modality_vec
fitting_enetxplorer <- function(resp_vec , modality_var,data_input,tuning_input){
data_split=data_input[[modality_var]]
enet_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],-all_of(.)))
enet_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],-all_of(.)))
enet_resp_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],all_of(.)))
enet_resp_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],all_of(.)))
doParallel::registerDoParallel(cores = 20)
fit_enet_all_IQR <-resp_vec %>%
future_map(.,~eNetXplorer(x = as.matrix(enet_train[[.]]) ,
y = as.vector(enet_resp_train[[.]][[.]]),
alpha =tuning_input[[modality_var]][[.]][["param_tune"]][["mixture"]],
n_fold = 10,
#nlambda.ext = 10,
#nlambda = 10, #uses the default 100 levels with the data-driven range defined by glmnet
scaled = TRUE,
seed = 123456),
.options = furrr_options(seed=TRUE))
return(fit_enet_all_IQR)
}
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
enet_tuning_fit <- modality_vec %>%
map(.,~fitting_enetxplorer(resp_vec = resp_names,
modality_var = .,
data_input = enet_data_all,
tuning_input = enet_results_xplorer))
names(enet_tuning_fit)<- modality_vec
here we show hyperparameters from enetXplorer.
Alpha is forced to be the same with the tidymodels approach. Lambda via glmnet adaptive range is close to the tidymodels approach. see more on this approach here
https://stats.stackexchange.com/questions/174897/choosing-the-range-and-grid-density-for-regularization-parameter-in-lasso The accuracy is out-of-bag correlations between predicted and observed within the training set. P value is based on the permutation test.
enet_fit_table <- function(modality_input,fit_input, tune_input,resp_input, plotting_names ){
fit_enet_all_IQR <- fit_input[[modality_input]]
glmn_tuning_all_IQR <- tune_input[[modality_input]]
lambdas_all <- vector("list", length = length(resp_input))
names(lambdas_all)<- resp_input
lambdas_all_best <- vector("list", length = length(resp_input))
names(lambdas_all_best)<- resp_input
summary_enet_all <- vector("list", length = length(resp_input))
names(summary_enet_all) <- resp_input
for(i in 1:length(resp_input)){
lambdas_all[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["lambda_values"]]
lambdas_all_best[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["best_lambda"]]
summary_enet_all[[resp_input[i]]]<- as_tibble(summary(fit_enet_all_IQR[[resp_input[i]]])[[2]]) %>% slice(1)
}
summary_enet_all_test <- summary_enet_all %>% bind_rows() %>%
mutate(respones = plotting_names$short_name)
summary_vec <- colnames(summary_enet_all_test)[-5]
summary_enet_all_test <- summary_vec %>% map(.,function(var_input=.){
summary_enet_all_test[[paste0(var_input)]] <- round(summary_enet_all_test[[paste0(var_input)]],4)
})
names(summary_enet_all_test)<- summary_vec
summary_enet_all_test <- summary_enet_all_test %>% bind_rows()%>%
mutate(respones = plotting_names$short_name)%>%
rename(.,Alpha = alpha, `Best-tune lambda` = lambda.max,
`Accuracy (Pearson r)` = QF.est,
`P-value` = model.vs.null.pval) %>%
#knitr::kable(caption = paste0("EnetXplorer fitting of ", modality_input))
kableExtra::kbl(caption = paste0("EnetXplorer fitting of ", modality_input)) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
# seems to have problem knitting
##problem solved by adding results='asis' option in knitting
}
modality_vec %>% map(.,~enet_fit_table(modality_input=.,
fit_input = enet_tuning_fit,
tune_input = enet_results_xplorer,
resp_input = resp_names,
plotting_names = resp_var_plotting))
[[1]]
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.0 | 0.7398 | 0.4706 | 0.0004 | 2-back Work Mem |
| 0.0 | 0.8318 | 0.3464 | 0.0004 | Pic Vocab |
| 0.0 | 1.5030 | 0.1769 | 0.0004 | Flanker |
| 0.3 | 0.1277 | 0.1183 | 0.0004 | Pattern Speed |
| 0.0 | 4.0530 | 0.1895 | 0.0004 | Seq Memory |
| 0.0 | 2.4728 | 0.2959 | 0.0004 | Reading Recog |
| 0.1 | 0.1947 | 0.2426 | 0.0004 | Little Man |
| 0.1 | 0.2181 | 0.1645 | 0.0004 | Audi Verbal |
| 0.0 | 0.8539 | 0.1153 | 0.0012 | integration SSRT |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.5 | 0.0254 | 0.1282 | 0.0008 | 2-back Work Mem |
| 0.2 | 0.0484 | 0.1658 | 0.0004 | Pic Vocab |
| 0.5 | 0.0476 | 0.1196 | 0.0012 | Flanker |
| 0.0 | 0.0593 | 0.0581 | 0.0496 | Pattern Speed |
| 0.1 | 0.0018 | 0.0052 | 0.4390 | Seq Memory |
| 0.0 | 0.0062 | 0.0741 | 0.0216 | Reading Recog |
| 0.8 | 0.0314 | 0.0936 | 0.0064 | Little Man |
| 0.9 | 0.0518 | 0.0549 | 0.0396 | Audi Verbal |
| 0.0 | 0.5711 | 0.3137 | 0.0004 | integration SSRT |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.0 | 0.6653 | 0.1379 | 0.0004 | 2-back Work Mem |
| 0.0 | 0.8611 | 0.2202 | 0.0004 | Pic Vocab |
| 0.0 | 2.6097 | 0.0704 | 0.0112 | Flanker |
| 0.0 | 2.0368 | 0.0783 | 0.0028 | Pattern Speed |
| 0.0 | 2.8319 | 0.0815 | 0.0064 | Seq Memory |
| 0.1 | 0.1873 | 0.2257 | 0.0004 | Reading Recog |
| 0.1 | 0.1866 | 0.0916 | 0.0040 | Little Man |
| 0.0 | 1.7944 | 0.1472 | 0.0004 | Audi Verbal |
| 1.0 | 0.0233 | 0.0760 | 0.0224 | integration SSRT |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0 | 0.0512 | 0.1677 | 0.0004 | 2-back Work Mem |
| 0 | 0.5142 | 0.2210 | 0.0004 | Pic Vocab |
| 0 | 3.0027 | 0.0962 | 0.0004 | Flanker |
| 0 | 0.3233 | 0.0822 | 0.0032 | Pattern Speed |
| 0 | 1.1274 | 0.0997 | 0.0004 | Seq Memory |
| 0 | 0.0907 | 0.1959 | 0.0004 | Reading Recog |
| 1 | 0.0304 | 0.1415 | 0.0004 | Little Man |
| 0 | 3.9336 | 0.0995 | 0.0008 | Audi Verbal |
| 1 | 0.0351 | 0.0645 | 0.0328 | integration SSRT |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.0 | 4.8930 | 0.1455 | 0.0004 | 2-back Work Mem |
| 0.2 | 0.0418 | 0.2112 | 0.0004 | Pic Vocab |
| 1.0 | 0.0328 | 0.0754 | 0.0060 | Flanker |
| 0.0 | 1.8544 | 0.0612 | 0.0100 | Pattern Speed |
| 0.0 | 2.7896 | 0.1288 | 0.0004 | Seq Memory |
| 0.0 | 0.9688 | 0.1869 | 0.0004 | Reading Recog |
| 0.0 | 2.9783 | 0.1447 | 0.0004 | Little Man |
| 0.1 | 0.2536 | 0.0920 | 0.0004 | Audi Verbal |
| 0.0 | 4.6013 | 0.0647 | 0.0144 | integration SSRT |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.0 | 0.7204 | 0.1089 | 0.0004 | 2-back Work Mem |
| 0.0 | 0.0129 | 0.2142 | 0.0004 | Pic Vocab |
| 0.2 | 0.0295 | 0.1105 | 0.0004 | Flanker |
| 0.0 | 0.3508 | 0.1073 | 0.0004 | Pattern Speed |
| 0.0 | 0.4257 | 0.1005 | 0.0004 | Seq Memory |
| 1.0 | 0.0019 | 0.1863 | 0.0004 | Reading Recog |
| 0.1 | 0.0544 | 0.1337 | 0.0004 | Little Man |
| 0.0 | 0.0939 | 0.0991 | 0.0004 | Audi Verbal |
| 0.0 | 1.5825 | -0.0135 | 0.4242 | integration SSRT |
extract_tibble <- function(elastic_mod, alpha_index) {
variable <- elastic_mod$feature_coef_wmean[, alpha_index] %>% names()
wmean <- elastic_mod$feature_coef_wmean[, alpha_index]
wsd <- elastic_mod$feature_coef_wsd[, alpha_index]
null_wmean <- elastic_mod$null_feature_coef_wmean[, alpha_index]
null_wsd <- elastic_mod$null_feature_coef_wsd[, alpha_index]
pvalue <- elastic_mod$feature_coef_model_vs_null_pval[, alpha_index]
tib <- tibble(variable, wmean, wsd, null_wmean, null_wsd, pvalue)
tib <- tib %>%
gather(key = 'placeholder', value = 'value', wmean, wsd, null_wmean, null_wsd) %>%
mutate(type = ifelse(str_detect(placeholder, 'null'), 'null', 'target'),
placeholder = (str_remove(placeholder, 'null_'))) %>%
mutate(type = factor(type, labels = c('Null', 'Target'))) %>%
spread(placeholder, value)
tib
}
modality_type_vec <- c("Nback","SST","MID")
str_type_vec <- c("X2backvs0back_ROI_","anystopvscorrectgo_ROI_","antiRewVsNeu_ROI_")
enet_interval_plot <-
function(data_input,
tuning_input,
modality_input,
str_input,
resp_input,
plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[[modality_input]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[[modality_input]]
coefs_enet_all <- resp_input %>%
map(.,
~extract_tibble(
fit_enet_all_IQR[[.]],
alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
coefs_enet_all <- coefs_enet_all %>%
map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null',
'Null permuted models',
'Target models'),
roi = str_remove(variable, str_input))%>%
left_join( .,new_shorter_names,by="roi"))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>%
group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <-
coefs_enet_test[[2]][[1]] %>%
mutate(direction1 =
ifelse(coefs_enet_test[[2]][[1]]$wmean >=
median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <=
floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>%
mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-
coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>%
mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <-
coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>%
mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]],
aes(x = fct_reorder(roiShort, wmean),
y = wmean,
ymax = wmean + 2 * wsd,
ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)',
y = 'Averaged Coefficient Across Models (±2 SD)',
col = 'Model type',
title = paste0(modality_input, " task-fMRI Predicting\n",
plotting_names$longer_name[[which(plotting_names$response==.)]])) +
facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
map2(.x=modality_type_vec,
.y=str_type_vec,
~enet_interval_plot(data_input=enet_tuning_fit,
tuning_input=enet_results_all,
resp_input = resp_names,
modality_input = .x,
str_input = .y,
plotting_names=resp_var_plotting))
## [[1]]
## [[1]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[1]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[1]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[1]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[1]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[1]]$NIHTBX_READING_UNCORRECTED
##
## [[1]]$LMT_SCR_PERC_CORRECT
##
## [[1]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[1]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
##
##
## [[2]]
## [[2]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[2]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[2]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[2]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[2]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[2]]$NIHTBX_READING_UNCORRECTED
##
## [[2]]$LMT_SCR_PERC_CORRECT
##
## [[2]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[2]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
##
##
## [[3]]
## [[3]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[3]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[3]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[3]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[3]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[3]]$NIHTBX_READING_UNCORRECTED
##
## [[3]]$LMT_SCR_PERC_CORRECT
##
## [[3]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[3]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
enet_interval_plot_smri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[["smri"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[["smri"]]
coefs_enet_all <- resp_input %>% map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
roi_vec <- coefs_enet_all[[1]][["variable"]]
roi_vec <- gsub("ASEG_","",roi_vec)
roi_vec <- gsub("Dest_Thick_","",roi_vec)
roi_frame <- tibble(variable=coefs_enet_all[[1]][["variable"]],roi=roi_vec)
coefs_enet_all <- coefs_enet_all %>% map(.,~left_join(.,roi_frame,by="variable")%>%filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'))%>%
left_join( .,new_shorter_names,by="roi"))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>%
group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>%
mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roiShort, wmean),
y = wmean,
ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)',
y = 'Averaged Coefficient Across Models (±2 SD)',
col = 'Model type',
title = paste0("sMRI Predicting\n",plotting_names$longer_name[[which(plotting_names$response==.)]])) +
facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
enet_interval_plot_smri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
roi_names <- colnames(select(rsmri_par_data, starts_with("parAuditory")))%>% str_remove_all(.,"parAuditory")
roi_names_all <- c("Auditory",roi_names)
par_roi_names_all <- paste0("par", roi_names_all)
matrix_prepare <- function(data_list){
heat_matrix <- matrix(rep(0,13*13),nrow = 13)
for(i in 1:dim(data_list)[1]){
if(str_detect(data_list$roi[i],paste0("^","par"))==TRUE){
coor_matrix_1 <- which(as.vector(par_roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("\\A",.)))%>%do.call(rbind,.)) ==TRUE)
coor_matrix_2 <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0(.,"$")))%>%do.call(rbind,.)) ==TRUE)
heat_matrix[coor_matrix_1,coor_matrix_2]<- heat_matrix[coor_matrix_2,coor_matrix_1]<- data_list$estimate[i]
}
if(str_detect(data_list$roi[i],paste0("^","Within"))==TRUE){
coor_matrix <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("Within",.)))%>%do.call(rbind,.)) ==TRUE)
heat_matrix[coor_matrix,coor_matrix]<- data_list$estimate[i]
}
}
heat_matrix[lower.tri(heat_matrix)]<- NA
return(heat_matrix)
}
enet_interval_plot_rsmri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_ridge_all_RS_par <- data_input[["rsmri"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_RS_par <- tuning_input[["rsmri"]]
coefs_ridge_all <- resp_input %>% map(.,~extract_tibble(fit_ridge_all_RS_par[[.]], alpha_index = paste0("a",glmn_tuning_all_RS_par[[.]][["param_tune"]][["mixture"]])))
coefs_ridge_all <- coefs_ridge_all %>% map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),roi=variable,roi_short= str_remove_all(variable,"^par")))
roi_num_ridge <- coefs_ridge_all %>% map(.,~dim(.)[1])
max_roi_ridge <- max(as.numeric(roi_num_ridge))
coefs_ridge_test <- coefs_ridge_all[[resp_input[1]]] %>% group_by(type)
coefs_ridge_test <- coefs_ridge_test%>% nest(-type)
coefs_ridge_test[[2]][[1]] <- coefs_ridge_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_ridge_test[[2]][[1]]$wmean >= median(coefs_ridge_test[[2]][[1]]$wmean)|roi_num_ridge[[resp_input[1]]] <= floor(max_roi_ridge/2),"big","small"))
coefs_ridge_test[[2]][[2]] <- coefs_ridge_test[[2]][[2]] %>% mutate(direction1=coefs_ridge_test[[2]][[1]]$direction1)
coefs_ridge_test <- coefs_ridge_test %>% unnest()
coefs_ridge_all <- coefs_ridge_all%>%map(.,~group_by(.,type))
coefs_ridge_all <- coefs_ridge_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_ridge_all[[resp_input[i]]][["data"]][[2]]<-coefs_ridge_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_ridge[[resp_input[i]]] <= floor(max_roi_ridge/2),"big","small"))
coefs_ridge_all[[resp_input[i]]][["data"]][[1]] <- coefs_ridge_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_ridge_all <- coefs_ridge_all %>%map(.,~unnest(.))
matrix_ridge<- coefs_ridge_all %>% map(.,~filter(., type=="Target models")%>% mutate(.,estimate=wmean)%>%matrix_prepare(.))
for(i in 1:length(resp_input)){
colnames(matrix_ridge[[resp_input[i]]]) <- roi_names_all
rownames(matrix_ridge[[resp_input[i]]])<- roi_names_all
}
matrix_ridge_test <- matrix_ridge%>% map(.,~as.data.frame(.)%>%mutate(roi = roi_names_all)%>%reshape2::melt(.,id.vars="roi"))
method_vec <- c("Enet")
matrix_list <- list(Enet = matrix_ridge_test)
matrix_list_name <- method_vec %>% map(.,function(method_used=.){
matrix_list_name <- resp_input%>%map(.,~mutate(matrix_list[[method_used]][[.]], method=method_used))
# matrix_list_name <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE)fileDate = '31Dec2020')
return(matrix_list_name)
})
names(matrix_list_name)=method_vec
legend_all <- method_vec %>% map(.,function(method_used=.){
legend_all <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE))
return(legend_all)
})
names(legend_all)<- method_vec
legend_all_test <- t(as.matrix (as.data.frame(legend_all)))
legend_test <- tibble("low"=legend_all_test[,1],"upp"=legend_all_test[,2],"resp"=resp_input, "method"=c(rep("Elastic Net",length(resp_input))))
legend_resp <- vector("list", length = length(resp_input))
names(legend_resp)<- resp_input
for(i in 1:length(resp_input)){
legend_resp[[resp_input[i]]]<- tibble("upp"= max(legend_test$upp[which(legend_test$resp==resp_input[i])]),
"low"=min(legend_test$low[which(legend_test$resp==resp_input[i])]))
}
resp_input%>% map(.,~ggplot(data =matrix_ridge_test[[.]], aes(x=variable, y=roi, fill=value)) +
geom_tile(color = "white")+
labs(x=NULL,y=NULL,title=paste0("rs-fMRI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]]))+
scale_fill_gradient2(low = muted("blue") , high =muted("red"), mid = "grey80",
midpoint = 0,na.value = "grey50", limit = c(legend_resp[[.]]$low, legend_resp[[.]]$upp), space = "Lab",
name="Parameter\nEstimation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
#axis.text.x = element_blank(),
#axis.text.y = element_blank(),
plot.title = element_text(size=17))+
coord_fixed())
}
enet_interval_plot_rsmri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
enet_interval_plot_DTI <- function(data_input, tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[["DTI"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[["DTI"]]
coefs_enet_all <- resp_input %>% map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
coefs_enet_all <- coefs_enet_all %>% map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),
roi = str_remove(variable, "FA_")))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roi, wmean),
y = wmean,
ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)', y = 'Averaged Coefficient Across Models (±2 SD)', col = 'Model type',
title = paste0("DTI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]])) +
#facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
enet_interval_plot_DTI(data_input=enet_tuning_fit,
tuning_input=enet_results_all,
resp_input = resp_names,
plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
Her we replace NA from each modality with unlikely small and large values, resulting in 2 predictors for each modality
## the following function just process the data to put into random forest model
rnd_for_data_prep = function(pred_select = "predicted",
data_list,
resp_chose,
resp_data=assessment(baseline_two_fold$splits[[1]]))
{
enet_pred <- plyr::join_all(list(
data_list[["MID_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("MID_pred"="pred"),
data_list[["smri_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("smri_pred"="pred"),
data_list[["SST_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("SST_pred"="pred"),
data_list[["rsmri_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("rsmri_pred"="pred"),
data_list[["Nback_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("Nback_pred"="pred"),
data_list[["DTI_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("DTI_pred"="pred")),
by='SUBJECTKEY', type='full') %>%
distinct(SUBJECTKEY, .keep_all = TRUE)
enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
enet_pred_noNA <- left_join(enet_pred_p,enet_pred_n,by="SUBJECTKEY")
resp_data <- select(resp_data,"SUBJECTKEY",all_of(resp_chose)) %>%
distinct(SUBJECTKEY, .keep_all = TRUE) %>%
drop_na() %>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}
enet_rf_valid_data <- resp_names %>%
map(.,~rnd_for_data_prep(data_list = enet_results_all,
resp_chose = all_of(.)))
enet_rf_test_baseline <- resp_names %>%
map(.,~rnd_for_data_prep(pred_select="predicted_baseline",
data_list = enet_results_all,
resp_chose = all_of(.),
resp_data = baseline_test))
enet_rf_test_followup <- resp_names %>%
map(.,~rnd_for_data_prep(pred_select="predicted_followup",
data_list = enet_results_all,
resp_chose = all_of(.),
resp_data = followup_analysis))
stack_datasize_extract <- function(resp_input, data_input){
one_allset <- data.frame(datasplit_number =dim(data_input[[resp_input]][["ID"]])[1],
response=resp_input)
return(one_allset)}
task_stack_valid <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_data))%>%
do.call(rbind,.)%>%
mutate(number_valid = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
task_stack_baseline <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline))%>%
do.call(rbind,.)%>%
mutate(baseline_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
task_stack_followup <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup))%>%
do.call(rbind,.)%>%
mutate(followup_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
plyr::join_all(list(task_stack_valid,task_stack_baseline,task_stack_followup,resp_var_plotting),
type="left", by = "response")%>%
select("short_name","number_valid",ends_with("test"))%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| short_name | number_valid | baseline_test | followup_test |
|---|---|---|---|
| 2-back Work Mem | 2480 | 4597 | 4902 |
| Pic Vocab | 2812 | 5240 | 5282 |
| Flanker | 2810 | 5235 | 5372 |
| Pattern Speed | 2804 | 5229 | 5351 |
| Seq Memory | 2809 | 5235 | 5368 |
| Reading Recog | 2809 | 5220 | 5269 |
| Little Man | 2808 | 5093 | 5458 |
| Audi Verbal | 2793 | 5182 | 5350 |
| integration SSRT | 1936 | 3725 | 4053 |
here we specified the 2nd layer model
random_forest_tuning <- function(resp,valid_list,baseline_test, followup_test){
rf_train_data <- valid_list[[resp]]
rf_test_baseline <- baseline_test[[resp]]
rf_test_followup <- followup_test[[resp]]
### to avoid repetition in coding, data frame lists are created with ID and without ID.
resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")),
data = rf_train_data[["NoID"]]) %>%
step_dummy(all_nominal()) %>%
prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <-rand_forest(mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(forest_rec) %>%
add_model(tune_spec)
set.seed(123456)
## nested data validation
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
#doParallel::registerDoParallel()
rf_grid <- grid_regular(
mtry(range = c(1, 12)), #12 is the highest number of predictors
min_n(range = c(1, 1000)),
levels = c(mtry = 12, min_n = 101) #mtry = 10 so that it increases one step at a time
)
tune_res <- tune_grid(
tune_wf,
resamples = all_rs,
metrics = metric_set(rmse),
grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")
final_rf <- finalize_model(
tune_spec,
best_tune
)
forest_prep <- prep(forest_rec)
## this one fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
set_engine("ranger",importance="permutation") %>%
parsnip::fit(as.formula(paste0(resp_var, "~ .")),
data = juice(forest_prep)
)
pred_baseline <- predict(rf_fit,new_data = rf_test_baseline[["NoID"]])
pred_followup <- predict(rf_fit,new_data = rf_test_followup[["NoID"]])
performance_baseline <- cor(pred_baseline$.pred,rf_test_baseline[["ID"]][[resp]])
performance_followup <- cor(pred_followup$.pred,rf_test_followup[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance_baseline=performance_baseline,
performance_followup=performance_followup)
result_list <- list("param_tune"=best_tune,
"predicted_baseline"=tibble(SUBJECTKEY=rf_test_baseline[["ID"]]$SUBJECTKEY,
"pred_baseline"=pred_baseline$.pred),
"predicted_followup"=tibble(SUBJECTKEY=rf_test_followup[["ID"]]$SUBJECTKEY,
"pred_followup"=pred_followup$.pred),
"model_fit"=rf_fit)
return(result_list)
}
Here we actually put all of the fitted results into the random forest. It is time-consuming and computing-expensive to run, so not run by default.
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
rf_fit_all <- resp_names %>%
future_map(.,~random_forest_tuning(resp=.,
valid_list = enet_rf_valid_data,
baseline_test = enet_rf_test_baseline,
followup_test = enet_rf_test_followup),
.options = furrr_options(seed=TRUE),.progress = TRUE)
random_fores_output_print <-
function(rf_output,enet_output,response_vec,names_plotting){
summary_rf <- vector("list", length = length(response_vec))
names(summary_rf) <- response_vec
for(i in 1:length(response_vec)){
summary_rf[[response_vec[i]]]<- as_tibble(rf_output[[response_vec[i]]][["param_tune"]]) %>% slice(1)
}
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
rf_perform_output <- summary_rf %>% bind_rows() %>%
mutate(respones = names_plotting$short_name)%>%
rename(., "#predictors/split"=mtry,
"#observations/split"=min_n,
"model#"=.config,
"Baseline (Pearson r)"=performance_baseline,
"Followup (Pearson r)"=performance_followup)
rf_result_vec <- paste0(mri_vec,"_results")
all_enet_performance <- response_vec %>%
map(.,function(resp){
summary_mri_all <- vector("list", length = length(rf_result_vec))
names(summary_mri_all)<-rf_result_vec
for(i in 1:length(rf_result_vec)){
summary_mri_all[[rf_result_vec[i]]]<-
as_tibble(enet_output[[rf_result_vec[i]]][[resp]][["param_tune"]]) %>% slice(1)
}
summary_mri_all <- summary_mri_all %>%
bind_rows() %>%
mutate(type = mri_vec)
return(summary_mri_all)
})
summary_rf_all <- summary_rf %>% bind_rows() %>% mutate(response=response_vec)
all_enet_performance_baseline <- all_enet_performance %>%
do.call(rbind,.) %>%
mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
select(type,baseline_performance,response) %>%
pivot_wider(names_from = type, values_from= c(baseline_performance))
all_enet_baseline <-
plyr::join_all(list(all_enet_performance_baseline,names_plotting,summary_rf_all),
type="left", by = "response") %>%
select(all_of(mri_vec),"short_name","performance_baseline")%>%
rename(random_forest_baseline = performance_baseline)
all_enet_performance_followup <- all_enet_performance%>% do.call(rbind,.) %>%
mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
select(type,followup_performance,response) %>%
pivot_wider(names_from = type, values_from= c(followup_performance))
all_enet_followup <-
plyr::join_all(list(all_enet_performance_followup,names_plotting,summary_rf_all),
type="left", by = "response") %>%
select(all_of(mri_vec),"short_name","performance_followup")%>%
rename(random_forest_followup = performance_followup)
return(list(rf_performance = rf_perform_output,rf_enet_baseline = all_enet_baseline,rf_enet_followup = all_enet_followup))
}
task_rf_results <- random_fores_output_print(rf_output = rf_fit_all,
enet_output = enet_results_all,
response_vec=resp_names,
names_plotting = resp_var_plotting)
task_rf_results[["rf_performance"]]%>%
rename("Response Variable" = respones, "Baseline" = "Baseline (Pearson r)", "Followup" = "Followup (Pearson r)") %>%
select("Response Variable", "Baseline","Followup", "#predictors/split", "#observations/split") %>%
mutate(across(2:3, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Each Cognitive Task")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Baseline | Followup | #predictors/split | #observations/split |
|---|---|---|---|---|
| 2-back Work Mem | 0.478 | 0.417 | 3 | 140 |
| Pic Vocab | 0.364 | 0.379 | 4 | 150 |
| Flanker | 0.183 | 0.199 | 4 | 490 |
| Pattern Speed | 0.193 | 0.189 | 2 | 420 |
| Seq Memory | 0.237 | 0.247 | 4 | 410 |
| Reading Recog | 0.311 | 0.311 | 12 | 280 |
| Little Man | 0.269 | 0.264 | 3 | 270 |
| Audi Verbal | 0.239 | 0.182 | 3 | 590 |
| integration SSRT | 0.292 | 0.333 | 6 | 350 |
task_rf_results[["rf_enet_baseline"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_baseline) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Baseline (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
|---|---|---|---|---|---|---|---|
| 2-back Work Mem | 0.478 | 0.492 | 0.154 | 0.175 | 0.125 | 0.140 | 0.130 |
| Pic Vocab | 0.364 | 0.318 | 0.174 | 0.124 | 0.213 | 0.184 | 0.178 |
| Flanker | 0.183 | 0.199 | 0.065 | 0.018 | 0.057 | 0.033 | 0.079 |
| Pattern Speed | 0.193 | 0.182 | 0.080 | 0.032 | 0.087 | 0.090 | 0.121 |
| Seq Memory | 0.237 | 0.210 | 0.102 | NA | 0.095 | 0.112 | 0.120 |
| Reading Recog | 0.311 | 0.289 | 0.157 | 0.102 | 0.189 | 0.166 | 0.138 |
| Little Man | 0.269 | 0.243 | 0.079 | 0.095 | 0.114 | 0.131 | 0.116 |
| Audi Verbal | 0.239 | 0.151 | 0.093 | 0.076 | 0.124 | 0.094 | 0.098 |
| integration SSRT | 0.292 | 0.122 | 0.015 | 0.328 | 0.031 | 0.067 | 0.086 |
task_rf_results[["rf_enet_followup"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_followup) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Follow-Up (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
|---|---|---|---|---|---|---|---|
| 2-back Work Mem | 0.417 | 0.480 | 0.132 | 0.195 | 0.185 | 0.177 | 0.199 |
| Pic Vocab | 0.379 | 0.298 | 0.128 | 0.147 | 0.251 | 0.226 | 0.190 |
| Flanker | 0.199 | 0.174 | 0.044 | 0.050 | 0.093 | 0.058 | 0.141 |
| Pattern Speed | 0.189 | 0.193 | 0.063 | 0.048 | 0.090 | 0.102 | 0.093 |
| Seq Memory | 0.247 | 0.228 | 0.098 | NA | 0.127 | 0.128 | 0.169 |
| Reading Recog | 0.311 | 0.284 | 0.126 | 0.092 | 0.205 | 0.174 | 0.151 |
| Little Man | 0.264 | 0.237 | 0.058 | 0.082 | 0.097 | 0.118 | 0.246 |
| Audi Verbal | 0.182 | 0.156 | 0.046 | 0.080 | 0.126 | 0.091 | 0.062 |
| integration SSRT | 0.333 | 0.160 | 0.017 | 0.371 | 0.086 | 0.077 | 0.120 |
enet_rf_valid_data_permu <- enet_rf_valid_data %>% map(.,function(data_input=.){
data_output<- data_input[["NoID"]] %>%
dplyr::rename("MID_large"="MID_pred.x",
"SST_large"="SST_pred.x",
"smri_large"="smri_pred.x",
"rsmri_large"="rsmri_pred.x",
"DTI_large"="DTI_pred.x",
"Nback_large"="Nback_pred.x",
"MID_small"="MID_pred.y",
"SST_small"="SST_pred.y",
"smri_small"="smri_pred.y",
"rsmri_small"="rsmri_pred.y",
"DTI_small"="DTI_pred.y",
"Nback_small"="Nback_pred.y")
return(data_output)
})
cf_fit <- resp_names %>% map(.,function(resp_var=.){
cforest(as.formula(paste0(resp_var, "~ .")),
data=enet_rf_valid_data_permu[[resp_var]],
control= cforest_unbiased(mtry=rf_fit_all[[resp_var]][["param_tune"]]$mtry,
ntree=1000,minsplit=rf_fit_all[[resp_var]][["param_tune"]]$min_n))
})
PI_permimp <-cf_fit %>% map(.,~permimp(.,progressBar = FALSE))
# resp_names %>% map(.,~plot(PI_permimp[[.]],
# type = "box",
# horizontal = TRUE,
# main=paste0("Conditional Permutation Importance ",
# resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))
resp_names %>% map(.,~plot(PI_permimp[[.]],
type = "bar",
interval="quantile",
main=paste0("Conditional Permutation Importance ",
resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))
## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
##
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
##
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
##
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
##
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
##
## $NIHTBX_READING_UNCORRECTED
## NULL
##
## $LMT_SCR_PERC_CORRECT
## NULL
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL
random_forest_plotting_followup <- function(rf_fit, followup_data, resp_vec, naming_frame ){
followup_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_followup"]][["pred_followup"]],
observed=followup_data[[resp_var]][["ID"]][[resp_var]],
SUBJECTKEY=followup_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
return(resp_pred)
})
task_followup_pred <- resp_vec %>%
map(., ~ggplot(followup_rf_pred[[.]],
aes(x = scale(pred) ,
y = scale(observed))) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],
'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_followup"]], 3)))
)
title_task_rf_pred_plot <- ggdraw() +
draw_label(
"Out-of-Sample Random Forest Stacking Predictive Ability at Followup",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
followup_pred_all_figure<- plot_grid(title_task_rf_pred_plot,plot_grid(plotlist = task_followup_pred),nrow = 2 , rel_heights = c(0.1, 1))
ggpubr::annotate_figure(followup_pred_all_figure,left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}
random_forest_plotting_baseline <- function(rf_fit, baseline_data, resp_vec,naming_frame ){
baseline_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_baseline"]][["pred_baseline"]],
observed=baseline_data[[resp_var]][["ID"]][[resp_var]],
SUBJECTKEY=baseline_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
return(resp_pred)
})
task_baseline_pred <- resp_vec %>% map(., ~ggplot(baseline_rf_pred[[.]],aes(x = scale(pred) , y = scale(observed))) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste (
naming_frame$short_name[[which(naming_frame$response==.)]]
,'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_baseline"]], 3)))
)
title_task_rf_pred_plot_baseline <- ggdraw() +
draw_label(
"Out-of-Sample Random Forest Stacking Predictive Ability at Baseline",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
baseline_pred_all_figure<- plot_grid(title_task_rf_pred_plot_baseline,
plot_grid(plotlist = task_baseline_pred),
nrow = 2 ,
rel_heights = c(0.1, 1))
ggpubr::annotate_figure(baseline_pred_all_figure,
left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}
random_forest_plotting_baseline(rf_fit =rf_fit_all,baseline_data=enet_rf_test_baseline,naming_frame = resp_var_plotting , resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
random_forest_plotting_followup(rf_fit =rf_fit_all,followup_data=enet_rf_test_followup,naming_frame = resp_var_plotting, resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
here cfa is used to compute g factor.
We fit two models (first order and second order models) onto the training data, and apply the fit to the validation (2nd-layer) and test sets (baseline and followup). This is to protect us from dataleakage.
# we only inlcuded task that are avaliable on two time points
TaskDVs1Batch = c("NIHTBX_PICVOCAB_UNCORRECTED",
"NIHTBX_READING_UNCORRECTED",
"NIHTBX_FLANKER_UNCORRECTED",
"NIHTBX_PATTERN_UNCORRECTED",
"NIHTBX_PICTURE_UNCORRECTED",
"PEA_RAVLT_LD_TRIAL_VII_TC"
)
CFA_lav_output_processing <- function(data_input,CFA_model1,CFA_model2){
first_order_output <- lavaan::lavPredict(CFA_model1, newdata =data_input )
second_order_output <- lavaan::lavPredict(CFA_model2, newdata = data_input)
### getting SUBJECTKEY for CFA model prediction
### the number of columns of CFA predictions and task performance data matched
task_data <- select(data_input,all_of(TaskDVs1Batch),"SUBJECTKEY")%>% drop_na()
CFA_lav <- tibble(SUBJECTKEY =task_data$SUBJECTKEY,
Lang_first_order = first_order_output[,1],
CogFlex_first_order = first_order_output[,2],
MemRe_first_order = first_order_output[,3],
g_second_order =second_order_output[,4])
return(CFA_lav)
}
CFA_lav_response <- function(data_input = baseline_two_fold$splits[[1]], test_baseline =baseline_test, test_followup=followup_analysis ){
## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for baseline observations
data_baseline <- test_baseline%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for followup observations
data_followup <- test_followup%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model
NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")
### second order CFA model
NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'
NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")
train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit)
followup_CFA_lav <- CFA_lav_output_processing(data_input = data_followup,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit)
output_list <- list(enet_tuning = train_CFA_lav,
RanFor_tuning = valid_CFA_lav,
testing_baseline = baseline_CFA_lav,
testing_followup=followup_CFA_lav)
return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit))
}
CFA_list <- CFA_lav_response(data_input = baseline_two_fold$splits[[1]],
test_baseline =baseline_test,
test_followup=followup_analysis)
CFA_response <- CFA_list[[1]]
NeuroCog1stOrder.Fit <- CFA_list[[2]]
NeuroCog2ndOrder.Fit <- CFA_list[[3]]
lavaan::summary(NeuroCog1stOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-7 ended normally after 31 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 15
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 20.462 20.029
## Degrees of freedom 6 6
## P-value (Chi-square) 0.002 0.003
## Scaling correction factor 1.022
## Yuan-Bentler correction (Mplus variant)
##
## Model Test Baseline Model:
##
## Test statistic 2949.557 2772.060
## Degrees of freedom 15 15
## P-value 0.000 0.000
## Scaling correction factor 1.064
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.995 0.995
## Tucker-Lewis Index (TLI) 0.988 0.987
##
## Robust Comparative Fit Index (CFI) 0.995
## Robust Tucker-Lewis Index (TLI) 0.988
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23239.010 -23239.010
## Scaling correction factor 1.121
## for the MLR correction
## Loglikelihood unrestricted model (H1) -23228.779 -23228.779
## Scaling correction factor 1.093
## for the MLR correction
##
## Akaike (AIC) 46508.020 46508.020
## Bayesian (BIC) 46597.617 46597.617
## Sample-size adjusted Bayesian (BIC) 46549.957 46549.957
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.029 0.028
## 90 Percent confidence interval - lower 0.016 0.015
## 90 Percent confidence interval - upper 0.043 0.042
## P-value RMSEA <= 0.05 0.994 0.996
##
## Robust RMSEA 0.029
## 90 Percent confidence interval - lower 0.015
## 90 Percent confidence interval - upper 0.043
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.014 0.014
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language =~
## NIHTBX_PICVOCA 1.000 0.712 0.712
## NIHTBX_READING 1.042 0.048 21.683 0.000 0.742 0.742
## CognitiveFlexibity =~
## NIHTBX_FLANKER 1.000 0.721 0.721
## NIHTBX_PATTERN 0.749 0.058 12.814 0.000 0.541 0.541
## MemoryRecall =~
## NIHTBX_PICTURE 1.000 0.603 0.603
## PEA_RAVLT_LD_T 1.152 0.068 16.872 0.000 0.694 0.695
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language ~~
## CognitivFlxbty 0.266 0.019 14.102 0.000 0.518 0.518
## MemoryRecall 0.272 0.017 15.811 0.000 0.633 0.633
## CognitiveFlexibity ~~
## MemoryRecall 0.193 0.017 11.130 0.000 0.443 0.443
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .NIHTBX_PICVOCA 0.493 0.025 19.590 0.000 0.493 0.493
## .NIHTBX_READING 0.449 0.026 17.058 0.000 0.449 0.449
## .NIHTBX_FLANKER 0.479 0.040 12.110 0.000 0.479 0.479
## .NIHTBX_PATTERN 0.707 0.030 23.896 0.000 0.707 0.708
## .NIHTBX_PICTURE 0.637 0.026 24.382 0.000 0.637 0.637
## .PEA_RAVLT_LD_T 0.517 0.032 16.296 0.000 0.517 0.518
## Language 0.507 0.031 16.267 0.000 1.000 1.000
## CognitivFlxbty 0.520 0.047 11.106 0.000 1.000 1.000
## MemoryRecall 0.363 0.028 12.876 0.000 1.000 1.000
##
## R-Square:
## Estimate
## NIHTBX_PICVOCA 0.507
## NIHTBX_READING 0.551
## NIHTBX_FLANKER 0.521
## NIHTBX_PATTERN 0.292
## NIHTBX_PICTURE 0.363
## PEA_RAVLT_LD_T 0.482
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall")
semPlot::semPaths(object=NeuroCog1stOrder.Fit,
intercepts = F, residuals = F,
whatLabels="no",
what = "std",
layout="tree",
node.width = 1.4,
edge.label.cex = .6,
nodeLabels=Plabels,
edge.color="black",
sizeMan = 10, sizeLat = 10)
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## Registered S3 methods overwritten by 'BDgraph':
## method from
## plot.sim lava
## print.sim lava
## Registered S3 methods overwritten by 'huge':
## method from
## plot.roc pROC
## plot.sim BDgraph
## print.roc pROC
## print.sim BDgraph
lavaan::summary(NeuroCog2ndOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-7 ended normally after 29 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 15
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 20.462 20.029
## Degrees of freedom 6 6
## P-value (Chi-square) 0.002 0.003
## Scaling correction factor 1.022
## Yuan-Bentler correction (Mplus variant)
##
## Model Test Baseline Model:
##
## Test statistic 2949.557 2772.060
## Degrees of freedom 15 15
## P-value 0.000 0.000
## Scaling correction factor 1.064
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.995 0.995
## Tucker-Lewis Index (TLI) 0.988 0.987
##
## Robust Comparative Fit Index (CFI) 0.995
## Robust Tucker-Lewis Index (TLI) 0.988
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23239.010 -23239.010
## Scaling correction factor 1.121
## for the MLR correction
## Loglikelihood unrestricted model (H1) -23228.779 -23228.779
## Scaling correction factor 1.093
## for the MLR correction
##
## Akaike (AIC) 46508.020 46508.020
## Bayesian (BIC) 46597.617 46597.617
## Sample-size adjusted Bayesian (BIC) 46549.957 46549.957
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.029 0.028
## 90 Percent confidence interval - lower 0.016 0.015
## 90 Percent confidence interval - upper 0.043 0.042
## P-value RMSEA <= 0.05 0.994 0.996
##
## Robust RMSEA 0.029
## 90 Percent confidence interval - lower 0.015
## 90 Percent confidence interval - upper 0.043
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.014 0.014
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language =~
## NIHTBX_PICVOCA 1.000 0.712 0.712
## NIHTBX_READING 1.042 0.048 21.683 0.000 0.742 0.742
## CognitiveFlexibity =~
## NIHTBX_FLANKER 1.000 0.721 0.721
## NIHTBX_PATTERN 0.749 0.058 12.814 0.000 0.541 0.541
## MemoryRecall =~
## NIHTBX_PICTURE 1.000 0.603 0.603
## PEA_RAVLT_LD_T 1.152 0.068 16.872 0.000 0.694 0.695
## g =~
## Language 0.613 0.029 20.816 0.000 0.860 0.860
## CognitivFlxbty 0.434 0.025 17.213 0.000 0.602 0.602
## MemoryRecall 0.444 0.027 16.494 0.000 0.736 0.736
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## g 1.000 1.000 1.000
## .NIHTBX_PICVOCA 0.493 0.025 19.590 0.000 0.493 0.493
## .NIHTBX_READING 0.449 0.026 17.058 0.000 0.449 0.449
## .NIHTBX_FLANKER 0.479 0.040 12.110 0.000 0.479 0.479
## .NIHTBX_PATTERN 0.707 0.030 23.896 0.000 0.707 0.708
## .NIHTBX_PICTURE 0.637 0.026 24.382 0.000 0.637 0.637
## .PEA_RAVLT_LD_T 0.517 0.032 16.296 0.000 0.517 0.518
## .Language 0.132 0.029 4.505 0.000 0.260 0.260
## .CognitivFlxbty 0.332 0.039 8.535 0.000 0.637 0.637
## .MemoryRecall 0.166 0.020 8.361 0.000 0.458 0.458
##
## R-Square:
## Estimate
## NIHTBX_PICVOCA 0.507
## NIHTBX_READING 0.551
## NIHTBX_FLANKER 0.521
## NIHTBX_PATTERN 0.292
## NIHTBX_PICTURE 0.363
## PEA_RAVLT_LD_T 0.482
## Language 0.740
## CognitivFlxbty 0.363
## MemoryRecall 0.542
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall","G")
semPlot::semPaths(object=NeuroCog2ndOrder.Fit,intercepts = F, residuals = F,
whatLabels="no", what = "std", layout="tree", node.width = 1.4,
edge.label.cex = .6, nodeLabels=Plabels, edge.color="black",
sizeMan = 10, sizeLat = 10)
### reliabity of the G-Factor
semTools::reliabilityL2(NeuroCog2ndOrder.Fit,"g")
## omegaL1 omegaL2 partialOmegaL1
## 0.6103282 0.7792178 0.7282842
CFA_Resp_Var <- names(CFA_response[["enet_tuning"]])[-1]%>% set_names()
train_CFA <- left_join(CFA_response[["enet_tuning"]],
analysis( baseline_two_fold$splits[[1]]),
by = "SUBJECTKEY")
valid_CFA <- left_join(CFA_response[["RanFor_tuning"]],
assessment( baseline_two_fold$splits[[1]]),
by = "SUBJECTKEY")
baseline_CFA <- left_join(CFA_response[["testing_baseline"]],
baseline_test,
by = "SUBJECTKEY")
followup_CFA <- left_join(CFA_response[["testing_followup"]],
followup_analysis,
by = "SUBJECTKEY")
nrow(train_CFA)
## [1] 2906
nrow(valid_CFA)
## [1] 2917
nrow(baseline_CFA )
## [1] 5427
nrow(followup_CFA )
## [1] 5280
GDistData <- data.table::data.table(G_Factor = train_CFA$g_second_order, Split = "1st-Layer Train") %>%
rbind(data.table::data.table(G_Factor = valid_CFA$g_second_order, Split = "2nd-Layer Train")) %>%
rbind(data.table::data.table(G_Factor = baseline_CFA$g_second_order, Split = "Baseline Test")) %>%
rbind(data.table::data.table(G_Factor = followup_CFA$g_second_order, Split = "Follow-Up Test"))
ggplot(data = GDistData, aes(x = G_Factor, color = Split)) +
geom_density() +
labs(x= "G-Factor") +
labs(y= "Density") +
theme(legend.title=element_blank()) +
theme(text = element_text(size=30)) +
theme(legend.position="top", legend.box="vertical", legend.margin=margin()) +
guides(color=guide_legend(nrow=2,byrow=TRUE))
CFA_var_plotting <- tibble("response" =CFA_Resp_Var,
"longer_name"=c("1st-Order Language",
"1st-Order Mental Flexibility",
"1st-Order Memory Recall",
"2nd-Order G-Factor"),
"short_name"=c("Language",
"Mental Flexibility",
"Memory Recall",
"G-Factor"))
Nback_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
contrast_name = "X2backvs0back_ROI_"))
SST_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA ,
contrast_name = "anystopvscorrectgo_ROI_"))
MID_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
contrast_name = "antiRewVsNeu_ROI_"))
DTI_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_comOne(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "FA_"))
smri_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_com(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
rsmri_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_com(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "par",
measure_two = "Within"))
enet_CFA_data <- list("Nback"=Nback_enet_CFA,
"SST"=SST_enet_CFA,
"MID"=MID_enet_CFA,
"rsmri"=rsmri_enet_CFA,
"smri"=smri_enet_CFA,
"DTI"=DTI_enet_CFA)
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
Nback_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = Nback_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
SST_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = SST_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
MID_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = MID_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
DTI_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = DTI_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
smri_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = smri_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
rsmri_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = rsmri_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
enet_CFA <- list("Nback_results"=Nback_CFA,
"SST_results"=SST_CFA,
"MID_results"=MID_CFA,
"rsmri_results"=rsmri_CFA,
"smri_results"=smri_CFA,
"DTI_results"=DTI_CFA)
datasplits_number(data_list = enet_CFA,resp_vec = CFA_Resp_Var, resp_plotting = CFA_var_plotting)
[[1]]
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| Nback | 1369 | 1331 | 2653 | 2996 | Language |
| Nback | 1369 | 1331 | 2653 | 2996 | Mental Flexibility |
| Nback | 1369 | 1331 | 2653 | 2996 | Memory Recall |
| Nback | 1369 | 1331 | 2653 | 2996 | G-Factor |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| SST | 1412 | 1421 | 2791 | 2903 | Language |
| SST | 1412 | 1421 | 2791 | 2903 | Mental Flexibility |
| SST | 1412 | 1421 | 2791 | 2903 | Memory Recall |
| SST | 1412 | 1421 | 2791 | 2903 | G-Factor |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| MID | 1611 | 1591 | 3138 | 3127 | Language |
| MID | 1611 | 1591 | 3138 | 3127 | Mental Flexibility |
| MID | 1611 | 1591 | 3138 | 3127 | Memory Recall |
| MID | 1611 | 1591 | 3138 | 3127 | G-Factor |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| rsmri | 1988 | 2007 | 3779 | 4137 | Language |
| rsmri | 1988 | 2007 | 3779 | 4137 | Mental Flexibility |
| rsmri | 1988 | 2007 | 3779 | 4137 | Memory Recall |
| rsmri | 1988 | 2007 | 3779 | 4137 | G-Factor |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| smri | 2395 | 2428 | 4507 | 4569 | Language |
| smri | 2395 | 2428 | 4507 | 4569 | Mental Flexibility |
| smri | 2395 | 2428 | 4507 | 4569 | Memory Recall |
| smri | 2395 | 2428 | 4507 | 4569 | G-Factor |
| modality | enet_train | stack_train | baseline | followup | short_name |
|---|---|---|---|---|---|
| DTI | 2268 | 2260 | 4071 | 4230 | Language |
| DTI | 2268 | 2260 | 4071 | 4230 | Mental Flexibility |
| DTI | 2268 | 2260 | 4071 | 4230 | Memory Recall |
| DTI | 2268 | 2260 | 4071 | 4230 | G-Factor |
modality_enet_CFA <-names(enet_CFA)
param_tune_enet <- modality_enet_CFA %>%
map(.,function(modality_input=.,list_input=enet_CFA){
tuned_results <- list_input[[modality_input]][["g_second_order"]][["param_tune"]]%>%
mutate(modality=modality_input)
return(tuned_results)
})%>% do.call(rbind,.)%>%
select("penalty", "mixture","modality")
param_tune_enet %>%
kableExtra::kbl(caption = "Tuned parameters for elastic net") %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| penalty | mixture | modality |
|---|---|---|
| 1.1489510 | 0.0 | Nback_results |
| 0.0195640 | 1.0 | SST_results |
| 0.0252354 | 1.0 | MID_results |
| 0.6080224 | 0.0 | rsmri_results |
| 0.1162322 | 0.1 | smri_results |
| 0.1162322 | 0.0 | DTI_results |
# mean of penalty term 0.6190428
param_tune_enet %>% summarise_at("penalty", mean)
## # A tibble: 1 x 1
## penalty
## <dbl>
## 1 0.339
# sd of penalty term 0.4316607
param_tune_enet %>% summarise_at("penalty", sd)
## # A tibble: 1 x 1
## penalty
## <dbl>
## 1 0.453
enetXplorer allows us to use permutation to test the significance of each feature
enet_CFA_xplorer <- enet_CFA
names(enet_CFA_xplorer) <- modality_vec
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
enet_CFA_fit <- modality_vec %>%
map(.,~fitting_enetxplorer(resp_vec = CFA_Resp_Var,
modality_var = .,
data_input = enet_CFA_data,
tuning_input = enet_CFA_xplorer))
names(enet_CFA_fit)<- modality_vec
modality_vec %>% map(.,~enet_fit_table(modality_input=.,
fit_input = enet_CFA_fit,
tune_input = enet_CFA_xplorer,
resp_input=CFA_Resp_Var,
plotting_names = CFA_var_plotting))
[[1]]
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0 | 1.1486 | 0.4058 | 0.0004 | Language |
| 0 | 2.0159 | 0.2895 | 0.0004 | Mental Flexibility |
| 0 | 2.1021 | 0.3251 | 0.0004 | Memory Recall |
| 0 | 1.2884 | 0.4097 | 0.0004 | G-Factor |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 1.0 | 0.0076 | 0.1731 | 0.0004 | Language |
| 1.0 | 0.0211 | 0.1447 | 0.0004 | Mental Flexibility |
| 0.1 | 0.0087 | 0.1218 | 0.0012 | Memory Recall |
| 1.0 | 0.0071 | 0.1784 | 0.0004 | G-Factor |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 1 | 0.0284 | 0.2756 | 0.0004 | Language |
| 0 | 2.2486 | 0.1614 | 0.0004 | Mental Flexibility |
| 0 | 1.5746 | 0.2128 | 0.0004 | Memory Recall |
| 1 | 0.0236 | 0.2649 | 0.0004 | G-Factor |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0 | 0.3352 | 0.2410 | 0.0004 | Language |
| 0 | 1.2950 | 0.1434 | 0.0004 | Mental Flexibility |
| 0 | 1.2108 | 0.1681 | 0.0004 | Memory Recall |
| 0 | 0.4319 | 0.2281 | 0.0004 | G-Factor |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0.1 | 0.0913 | 0.2483 | 0.0004 | Language |
| 0.0 | 2.6983 | 0.1540 | 0.0004 | Mental Flexibility |
| 0.0 | 2.0348 | 0.1881 | 0.0004 | Memory Recall |
| 0.1 | 0.1134 | 0.2387 | 0.0004 | G-Factor |
| Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
|---|---|---|---|---|
| 0 | 0.0128 | 0.2370 | 0.0004 | Language |
| 1 | 0.0002 | 0.1776 | 0.0004 | Mental Flexibility |
| 0 | 0.0394 | 0.1758 | 0.0004 | Memory Recall |
| 0 | 0.0134 | 0.2292 | 0.0004 | G-Factor |
map2(.x=modality_type_vec,
.y=str_type_vec,
~enet_interval_plot(data_input=enet_CFA_fit,
tuning_input=enet_CFA_xplorer,
resp_input =CFA_Resp_Var,
modality_input = .x,
str_input = .y,
plotting_names = CFA_var_plotting))
## [[1]]
## [[1]]$Lang_first_order
##
## [[1]]$CogFlex_first_order
##
## [[1]]$MemRe_first_order
##
## [[1]]$g_second_order
##
##
## [[2]]
## [[2]]$Lang_first_order
##
## [[2]]$CogFlex_first_order
##
## [[2]]$MemRe_first_order
##
## [[2]]$g_second_order
##
##
## [[3]]
## [[3]]$Lang_first_order
##
## [[3]]$CogFlex_first_order
##
## [[3]]$MemRe_first_order
##
## [[3]]$g_second_order
enet_interval_plot_smri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
### Plot results from eNetExplorer with rs-fMRI predicting each factor score
enet_interval_plot_rsmri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
enet_interval_plot_DTI(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
library(ggseg)
library(ggsegExtra)
library(ggsegDesterieux)
modality_type_vec <- list("Nback","SST","MID","smri")
# modality_word_vec <- list("NBack-fMRI", "SST-fMRI","MID-fMRI","sMRI")
modality_type_word_vec <- list(c("Nback","NBack-fMRI"),
c("SST","SST-fMRI"),
c("MID","MID-fMRI"),
c("smri","sMRI"))
str_type_vec <- list("X2backvs0back_ROI_",
"anystopvscorrectgo_ROI_",
"antiRewVsNeu_ROI_",
c("ASEG_","Dest_Thick_"))
#CFA_Resp_vec <- c("Lang_first_order", "CogFlex_first_order", "MemRe_first_order", "g_second_order")
brainPrepTibFunc <- function(enet_CFA_fit,
modality_type,
str_type,
CFA_Resp) {
if (modality_type[1] == "smri") {
brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]],
alpha_index = paste0("a",
enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,label = str_replace_all(variable, str_type[1], '')) %>%
mutate(.,label = str_replace_all(label, str_type[2], '')) %>%
mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>%
mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
}
else{
brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]],
alpha_index = paste0("a",
enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,label = str_replace_all(variable, str_type, '')) %>%
mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>%
mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
}
coefs_ggsegDes_all <-
ggsegDesterieux::desterieux %>% as_tibble %>%
select(label) %>%
na.omit() %>%
left_join(select(brainPlotTib,
label,
wmean,
pvalue,
variable), by = "label")
coefs_ggsegAseg_all <-
ggseg::aseg$data %>% as_tibble %>%
select(label) %>%
na.omit() %>%
left_join(select(brainPlotTib,
label,
wmean,
pvalue,
variable), by = "label")
coefs_ggsegMax <- max(max(coefs_ggsegDes_all$wmean, na.rm = TRUE),
max(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
coefs_ggsegMin <- min(min(coefs_ggsegDes_all$wmean, na.rm = TRUE),
min(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
plot_ggsegDes_all <-
ggsegDesterieux::desterieux %>% as_tibble %>%
right_join(coefs_ggsegDes_all, by ="label") %>%
select(label, wmean, pvalue) %>% filter(pvalue <.05) %>%
ggseg(atlas = 'desterieux',
mapping = aes(fill = wmean),
colour="black"
) +
theme_void() +
scale_fill_gradient2(
limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
theme(legend.position = "none")+
labs(title =paste0(modality_type[2]))
plot_ggsegAseg_all <-
ggseg::aseg$data %>% as_tibble %>%
right_join(coefs_ggsegAseg_all, by ="label") %>%
select(label, wmean, pvalue) %>% filter(pvalue <.05) %>%
ggseg(atlas="aseg",
mapping=aes(fill=wmean),
view = "axial",
colour="black") +
theme_void() +
scale_fill_gradient2(
limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = NULL))
theme(legend.position = "none")
plotPreptaskfMRI <-
list(brainPlotTib,
coefs_ggsegDes_all,
coefs_ggsegAseg_all,
coefs_ggsegMax,
coefs_ggsegMin,
plot_ggsegDes_all,
plot_ggsegAseg_all) %>%
purrr::set_names("brainPlotTib","coefs_ggsegDes_all", "coefs_ggsegAseg_all",
"coefs_ggsegMax","coefs_ggsegMin","plot_ggsegDes_all", "plot_ggsegAseg_all")
return(plotPreptaskfMRI)
}
plotPreptaskfMRI <-
modality_type_word_vec %>%
purrr::set_names(purrr::map(.,1)) %>%
purrr::map2(.x = .,
.y = str_type_vec,
~ brainPrepTibFunc(enet_CFA_fit = enet_CFA_fit,
modality_type = .x,
str_type = .y,
CFA_Resp = "g_second_order"))
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## Warning: Some data not merged properly. Check for naming errors in data:
## atlas type hemi side region label wmean pvalue
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 <NA> <NA> <NA> <NA> <NA> right-cerebellum-cortex 0.0301 0.00120
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegDes_all")
,nrow=4)
# ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegAs_all")
# ,nrow=4)
#
modality_type_vec %>% map(.,~gridExtra::grid.arrange(plotPreptaskfMRI[[.]][["plot_ggsegDes_all"]],
plotPreptaskfMRI[[.]][["plot_ggsegAseg_all"]],
nrow = 1, ncol = 2, widths = c(4, 1.5)))
## [[1]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[2]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[3]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[4]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
library(ggsegJHU)
HaglerToJhu <- read_csv(paste0(utilFold, "HaglerToJhu.csv"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## hemi = col_character(),
## region = col_character(),
## side = col_character(),
## label = col_character(),
## Hagler = col_character()
## )
DTITrib <- extract_tibble(enet_CFA_fit[["DTI"]][["g_second_order"]],
alpha_index = paste0("a", enet_CFA_xplorer[["DTI"]][["g_second_order"]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,Hagler = str_replace_all(variable, "FA_", ''))
ggsegJHU::jhu %>% as_tibble %>%
left_join(HaglerToJhu, by = c("hemi","region","side","label")) %>%
left_join(DTITrib %>% select(-type), by = "Hagler") %>%
filter(pvalue <.05) %>%
select( -geometry) %>%
ggseg(atlas = 'jhu',
mapping = aes(fill = wmean),
colour="black") +
theme_void() +
theme(text = element_text(size=30)) +
labs(title = "DTI") +
scale_fill_gradient2(
# limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
guides(fill = guide_colourbar(barwidth = 1.5, barheight = 9, title = NULL))
## merging atlas and data by 'atlas', 'type', 'hemi', 'region', 'side', 'label'
theme(legend.position = "none")
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
enet_rf_valid_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select = "predicted",
data_list = enet_CFA,
resp_chose = .,
resp_data = valid_CFA))
enet_rf_test_baseline_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select="predicted_baseline",
data_list = enet_CFA,
resp_chose = .,
resp_data = baseline_CFA))
enet_rf_test_followup_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select="predicted_followup",
data_list = enet_CFA,
resp_chose = .,
resp_data = followup_CFA))
CFA_stack_valid <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_CFA))%>%
do.call(rbind,.)%>%
mutate(number_valid = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
CFA_stack_baseline <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline_CFA))%>%
do.call(rbind,.)%>%
mutate(baseline_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
CFA_stack_followup <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup_CFA))%>%
do.call(rbind,.)%>%
mutate(followup_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
plyr::join_all(list(CFA_stack_valid,CFA_stack_baseline,CFA_stack_followup,CFA_var_plotting),
type="left", by = "response")%>%
select("short_name","number_valid",ends_with("test"))%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| short_name | number_valid | baseline_test | followup_test |
|---|---|---|---|
| Language | 2740 | 5108 | 5119 |
| Mental Flexibility | 2740 | 5108 | 5119 |
| Memory Recall | 2740 | 5108 | 5119 |
| G-Factor | 2740 | 5108 | 5119 |
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
rf_fit_CFA <- CFA_Resp_Var %>%
future_map(.,~random_forest_tuning(resp=.,
valid_list = enet_rf_valid_CFA,
baseline_test = enet_rf_test_baseline_CFA,
followup_test = enet_rf_test_followup_CFA),
.options = furrr_options(seed=TRUE),
.progress = TRUE)
CFA_rf_results <- random_fores_output_print(rf_output = rf_fit_CFA,
enet_output = enet_CFA,
response_vec=CFA_Resp_Var,
names_plotting = CFA_var_plotting)
CFA_rf_results[["rf_performance"]]%>%
rename("Response Variable" = respones,
"Baseline" = "Baseline (Pearson r)",
"Followup" = "Followup (Pearson r)") %>%
select("Response Variable",
"Baseline","Followup",
"#predictors/split",
"#observations/split") %>%
mutate(across(2:3, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Factor Scores")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Baseline | Followup | #predictors/split | #observations/split |
|---|---|---|---|---|
| Language | 0.423 | 0.418 | 4 | 160 |
| Mental Flexibility | 0.314 | 0.305 | 8 | 350 |
| Memory Recall | 0.369 | 0.348 | 4 | 200 |
| G-Factor | 0.434 | 0.414 | 4 | 170 |
CFA_rf_results[["rf_enet_baseline"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_baseline) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Baseline(Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
|---|---|---|---|---|---|---|---|
| Language | 0.423 | 0.388 | 0.197 | 0.140 | 0.228 | 0.211 | 0.192 |
| Mental Flexibility | 0.314 | 0.313 | 0.144 | 0.063 | 0.133 | 0.132 | 0.144 |
| Memory Recall | 0.369 | 0.315 | 0.169 | 0.082 | 0.182 | 0.179 | 0.166 |
| G-Factor | 0.434 | 0.402 | 0.202 | 0.129 | 0.224 | 0.210 | 0.193 |
CFA_rf_results[["rf_enet_followup"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_followup) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Follow-Up (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
|---|---|---|---|---|---|---|---|
| Language | 0.418 | 0.370 | 0.149 | 0.150 | 0.254 | 0.236 | 0.211 |
| Mental Flexibility | 0.305 | 0.292 | 0.090 | 0.074 | 0.166 | 0.156 | 0.173 |
| Memory Recall | 0.348 | 0.319 | 0.123 | 0.130 | 0.206 | 0.189 | 0.188 |
| G-Factor | 0.414 | 0.383 | 0.148 | 0.145 | 0.253 | 0.230 | 0.217 |
enet_rf_valid_CFA_permu <- enet_rf_valid_CFA %>% map(.,function(data_input=.){
data_output<- data_input[["NoID"]] %>%
dplyr::rename("MID_large"="MID_pred.x",
"SST_large"="SST_pred.x",
"smri_large"="smri_pred.x",
"rsmri_large"="rsmri_pred.x",
"DTI_large"="DTI_pred.x",
"Nback_large"="Nback_pred.x",
"MID_small"="MID_pred.y",
"SST_small"="SST_pred.y",
"smri_small"="smri_pred.y",
"rsmri_small"="rsmri_pred.y",
"DTI_small"="DTI_pred.y",
"Nback_small"="Nback_pred.y")
return(data_output)
})
cf_fit_CFA <- CFA_Resp_Var %>% map(.,function(resp_var=.){
cforest(as.formula(paste0(resp_var, "~ .")),
data=enet_rf_valid_CFA_permu[[resp_var]],
control= cforest_unbiased(mtry=rf_fit_CFA[[resp_var]][["param_tune"]]$mtry,
ntree=1000,
minsplit=rf_fit_CFA[[resp_var]][["param_tune"]]$min_n))
})
PI_permimp_CFA <-cf_fit_CFA %>%
map(.,~permimp(.,progressBar = FALSE))
# CFA_Resp_Var %>%
# map(.,~plot(PI_permimp_CFA[[.]], type = "box",
# horizontal = TRUE,
# main=paste0("Conditional Permutation Importance ",
# CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))
CFA_Resp_Var %>%
map(.,~plot(PI_permimp_CFA[[.]],
type = "bar",
interval="quantile",
main=paste0("Conditional Permutation Importance ",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))
## $Lang_first_order
## NULL
##
## $CogFlex_first_order
## NULL
##
## $MemRe_first_order
## NULL
##
## $g_second_order
## NULL
random_forest_plotting_followup(rf_fit =rf_fit_CFA,
followup_data=enet_rf_test_followup_CFA,
naming_frame = CFA_var_plotting,
resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
random_forest_plotting_baseline(rf_fit =rf_fit_CFA,
baseline_data=enet_rf_test_baseline_CFA,
naming_frame = CFA_var_plotting,
resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
modality_ploting <- c("Stacked_All","Stacked_Complete","Stacked_Best", "Stacked_NoBest", "NBack","MID", "SST","rs_fMRI","sMRI","DTI")
resp_plotting <- c("g_second_order")
predictedObserved_GfactBaseline <-
baseline_CFA %>% select(SUBJECTKEY, g_second_order) %>%
# enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
# select(SUBJECTKEY, g_second_order) %>%
full_join(rf_fit_CFA$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("Stacked_All"= pred_baseline) %>%
full_join(enet_CFA$Nback_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("NBack" = pred) %>%
full_join(enet_CFA$SST_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("SST" = pred) %>%
full_join(enet_CFA$MID_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("MID" = pred) %>%
full_join(enet_CFA$rsmri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("rs_fMRI" = pred) %>%
full_join(enet_CFA$smri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("sMRI" = pred) %>%
full_join(enet_CFA$DTI_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("DTI" = pred)
#create a column for list wise deletion and apply z score
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaseline %>%
na.omit() %>%
select(SUBJECTKEY, "Stacked_All") %>%
rename("Stacked_Complete" = "Stacked_All") %>%
right_join(predictedObserved_GfactBaseline, by = "SUBJECTKEY") %>%
mutate_at(resp_plotting, ~(scale(.) %>% as.vector))
#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaselineZ %>%
select(SUBJECTKEY, "Stacked_All", "NBack") %>%
drop_na("NBack") %>%
rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>%
right_join(predictedObserved_GfactBaselineZ, by = "SUBJECTKEY") %>%
mutate(Stacked_NoBest = Stacked_All)
predictedObserved_GfactBaselineZ[which(!is.na(predictedObserved_GfactBaselineZ$NBack)),
"Stacked_NoBest"] <- NA
predictedObserved_GfactBaselineZ <-
predictedObserved_GfactBaselineZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]
plotGFact_baseline <- list()
for (modalityNum in seq(1,length(modality_ploting),1))
{
plotGFact_baseline[[modalityNum]] <-
ggplot(data = predictedObserved_GfactBaselineZ,
aes_string(x = modality_ploting[modalityNum],
y = 'g_second_order')) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste0(modality_ploting[modalityNum],
' r=',round(cor(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]],
predictedObserved_GfactBaselineZ$g_second_order,
use = "pairwise.complete.obs"), 3))) +
# "\nMAE=", round(
# yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]])),
# predictedObserved_GfactBaselineZ$g_second_order,
# na_rm = TRUE),3))) +
theme(text = element_text(size=16))
}
figure <-
ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_baseline[1:4]),
ggpubr::ggarrange(plotlist=plotGFact_baseline[5:10]),
nrow = 2)
ggpubr::annotate_figure(figure,
top = ggpubr::text_grob("Ability to Predict G-Factor at Baseline", face = "bold", size = 29),
bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
)
naniar::vis_miss(predictedObserved_GfactBaselineZ %>% select(-SUBJECTKEY,-g_second_order)) +
guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
theme(
axis.text.x=element_text(angle =- 90,hjust = 1),
text = element_text(size=18))
colSums(!is.na(predictedObserved_GfactBaselineZ))
## SUBJECTKEY g_second_order Stacked_All Stacked_Complete
## 5487 5487 5171 1164
## Stacked_Best Stacked_NoBest NBack SST
## 2653 2518 2653 2791
## MID rs_fMRI sMRI DTI
## 3138 3839 4567 4071
predictedObserved_Gfactfollowup <-
followup_CFA %>% select(SUBJECTKEY, g_second_order) %>%
# enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
# select(SUBJECTKEY, g_second_order) %>%
full_join(rf_fit_CFA$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("Stacked_All"= pred_followup) %>%
full_join(enet_CFA$Nback_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("NBack" = pred) %>%
full_join(enet_CFA$SST_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("SST" = pred) %>%
full_join(enet_CFA$MID_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("MID" = pred) %>%
full_join(enet_CFA$rsmri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("rs_fMRI" = pred) %>%
full_join(enet_CFA$smri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("sMRI" = pred) %>%
full_join(enet_CFA$DTI_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("DTI" = pred)
#create a column for list wise deletion and apply z score
predictedObserved_GfactfollowupZ <- predictedObserved_Gfactfollowup %>%
na.omit() %>%
select(SUBJECTKEY, "Stacked_All") %>%
rename("Stacked_Complete" = "Stacked_All") %>%
right_join(predictedObserved_Gfactfollowup, by = "SUBJECTKEY") %>%
mutate_at(resp_plotting, ~(scale(.) %>% as.vector))
#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects
predictedObserved_GfactfollowupZ <- predictedObserved_GfactfollowupZ %>%
select(SUBJECTKEY, "Stacked_All", "NBack") %>%
drop_na("NBack") %>%
rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>%
right_join(predictedObserved_GfactfollowupZ, by = "SUBJECTKEY") %>%
mutate(Stacked_NoBest = Stacked_All)
predictedObserved_GfactfollowupZ[which(!is.na(predictedObserved_GfactfollowupZ$NBack)),
"Stacked_NoBest"] <- NA
predictedObserved_GfactfollowupZ <-
predictedObserved_GfactfollowupZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]
plotGFact_followup <- list()
for (modalityNum in seq(1,length(modality_ploting),1))
{
plotGFact_followup[[modalityNum]] <-
ggplot(data = predictedObserved_GfactfollowupZ,
aes_string(x = modality_ploting[modalityNum],
y = 'g_second_order')) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste0(modality_ploting[modalityNum],
' r=',round(cor(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]],
predictedObserved_GfactfollowupZ$g_second_order,
use = "pairwise.complete.obs"), 3)))+
# "\nMAE=", round(
# yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]])),
# predictedObserved_GfactfollowupZ$g_second_order,
# na_rm = TRUE),3))) +
theme(text = element_text(size=16))
}
figure <-
ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_followup[1:4]),
ggpubr::ggarrange(plotlist=plotGFact_followup[5:10]),
nrow = 2)
ggpubr::annotate_figure(figure,
top = ggpubr::text_grob("Ability to Predict G-Factor at Follow-up", face = "bold", size = 29),
bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
)
### number of (non-)missing values
naniar::vis_miss(predictedObserved_GfactfollowupZ %>% select(-SUBJECTKEY,-g_second_order)) +
guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
theme(
axis.text.x=element_text(angle =- 90,hjust = 1),
text = element_text(size=20))
colSums(!is.na(predictedObserved_GfactfollowupZ))
## SUBJECTKEY g_second_order Stacked_All Stacked_Complete
## 5280 5280 5119 1374
## Stacked_Best Stacked_NoBest NBack SST
## 2996 2123 2996 2903
## MID rs_fMRI sMRI DTI
## 3127 4137 4569 4230
get_summary_stats <- function(data_input, target_input){
data_input <- data_input %>%
mutate(pred_target = as.numeric(scale(data_input$g_second_order)) )
mae_perform <- yardstick::mae(data =data_input,
truth=pred_target, estimate=data_input[[target_input]])
rmse_perform <- yardstick::rmse(data =data_input,
truth=pred_target, estimate=data_input[[target_input]])
cor_perform <- cor(data_input$g_second_order,
data_input[[target_input]],
use = "pairwise.complete.obs")
tradrsq_perform <- yardstick::rsq_trad(data = data_input,
truth=pred_target,
estimate=data_input[[target_input]])
return( tibble(Models=target_input,
Pearson_r=round(cor_perform,3),
R_squared=round(tradrsq_perform$.estimate,3),
MAE=round(mae_perform$.estimate,3),
RMSE=round(rmse_perform$.estimate,3)
) )
}
elem_remove <- c("SUBJECTKEY","g_second_order")
baseline_pred_vec <- setdiff(colnames(predictedObserved_GfactBaselineZ),elem_remove)
baseline_perform <- baseline_pred_vec %>%
map(.,~get_summary_stats(data_input = predictedObserved_GfactBaselineZ,
target_input = .))%>%
do.call(rbind,.)%>%
kableExtra::kbl(caption = paste0("Predictive Performance for the baseline test set")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
print()
| Models | Pearson_r | R_squared | MAE | RMSE |
|---|---|---|---|---|
| Stacked_All | 0.439 | 0.191 | 0.699 | 0.895 |
| Stacked_Complete | 0.429 | 0.183 | 0.610 | 0.780 |
| Stacked_Best | 0.442 | 0.195 | 0.620 | 0.798 |
| Stacked_NoBest | 0.296 | 0.085 | 0.783 | 0.987 |
| NBack | 0.402 | 0.072 | 0.664 | 0.857 |
| SST | 0.129 | -0.033 | 0.744 | 0.950 |
| MID | 0.202 | 0.013 | 0.738 | 0.944 |
| rs_fMRI | 0.233 | 0.042 | 0.749 | 0.955 |
| sMRI | 0.208 | 0.040 | 0.763 | 0.969 |
| DTI | 0.190 | 0.033 | 0.757 | 0.972 |
followup_pred_vec <- setdiff(colnames(predictedObserved_GfactfollowupZ)
,elem_remove)
followup_perform <- followup_pred_vec %>%
map(.,~get_summary_stats(data_input = predictedObserved_GfactfollowupZ,
target_input = .))%>%
do.call(rbind,.)%>%
kableExtra::kbl(caption = paste0("Predictive Performance for the followup test set")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
print()
| Models | Pearson_r | R_squared | MAE | RMSE |
|---|---|---|---|---|
| Stacked_All | 0.414 | 0.166 | 0.719 | 0.913 |
| Stacked_Complete | 0.427 | 0.168 | 0.651 | 0.829 |
| Stacked_Best | 0.438 | 0.175 | 0.666 | 0.846 |
| Stacked_NoBest | 0.317 | 0.100 | 0.794 | 1.000 |
| NBack | 0.383 | 0.118 | 0.687 | 0.875 |
| SST | 0.145 | -0.004 | 0.760 | 0.961 |
| MID | 0.148 | -0.003 | 0.757 | 0.955 |
| rs_fMRI | 0.251 | 0.055 | 0.754 | 0.954 |
| sMRI | 0.226 | 0.049 | 0.764 | 0.965 |
| DTI | 0.212 | 0.045 | 0.771 | 0.978 |
perfDiff = function(data, indices){
sample = data[indices,]
cor_Stack <- cor(sample$g_second_order,
sample$Stacked_Best,
use = "pairwise.complete.obs")
cor_NBack <- cor(sample$g_second_order,
sample$NBack,
use = "pairwise.complete.obs")
tradrsq_Stack <- yardstick::rsq_trad(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
tradrsq_NBack <- yardstick::rsq_trad(data =sample,
truth=g_second_order,
estimate=NBack)
mae_Stack <- yardstick::mae(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
mae_NBack <- yardstick::mae(data =sample,
truth=g_second_order,
estimate=NBack)
rmse_Stack <- yardstick::rmse(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
rmse_NBack <- yardstick::rmse(data =sample,
truth=g_second_order,
estimate=NBack)
cor_StackVSNBack = cor_Stack - cor_NBack
tradrsq_StackVSNBack = tradrsq_Stack$.estimate - tradrsq_NBack$.estimate
mae_StackVSNBack = mae_Stack$.estimate - mae_NBack$.estimate
rmse_StackVSNBack = rmse_Stack$.estimate - rmse_NBack$.estimate
StackVSNBack = c(cor_StackVSNBack, tradrsq_StackVSNBack, mae_StackVSNBack, rmse_StackVSNBack)
return(StackVSNBack)
}
StackBestVSBaseline <- predictedObserved_GfactBaselineZ %>%
select(g_second_order,Stacked_Best,NBack) %>% na.omit()
set.seed(12345)
DiffResultsBaseline <- boot::boot(data = StackBestVSBaseline,
statistic = perfDiff,
R = 5000)
#"Bias Corrected and Accelerated"
ci_r_baseline <- boot::boot.ci(DiffResultsBaseline, index = 1, conf = 0.95, type = 'bca')
ci_rsq_baseline <-boot::boot.ci(DiffResultsBaseline, index = 2, conf = 0.95, type = 'bca')
ci_mae_baseline <- boot::boot.ci(DiffResultsBaseline, index = 3, conf = 0.95, type = 'bca')
ci_rmse_baseline <- boot::boot.ci(DiffResultsBaseline, index = 4, conf = 0.95, type = 'bca')
ci_r_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 1)
##
## Intervals :
## Level BCa
## 95% ( 0.0206, 0.0585 )
## Calculations and Intervals on Original Scale
ci_rsq_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 2)
##
## Intervals :
## Level BCa
## 95% ( 0.0965, 0.1491 )
## Calculations and Intervals on Original Scale
ci_mae_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 3)
##
## Intervals :
## Level BCa
## 95% (-0.0565, -0.0320 )
## Calculations and Intervals on Original Scale
ci_rmse_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 4)
##
## Intervals :
## Level BCa
## 95% (-0.0710, -0.0464 )
## Calculations and Intervals on Original Scale
StackBestVSfollowup <- predictedObserved_GfactfollowupZ %>%
select(g_second_order,Stacked_Best,NBack) %>% na.omit()
set.seed(12345)
DiffResultsfollowup <- boot::boot(data = StackBestVSfollowup,
statistic = perfDiff,
R = 5000)
#"Bias Corrected and Accelerated"
ci_r_followup <- boot::boot.ci(DiffResultsfollowup, index = 1, conf = 0.95, type = 'bca')
ci_rsq_followup <-boot::boot.ci(DiffResultsfollowup, index = 2, conf = 0.95, type = 'bca')
ci_mae_followup <- boot::boot.ci(DiffResultsfollowup, index = 3, conf = 0.95, type = 'bca')
ci_rmse_followup <- boot::boot.ci(DiffResultsfollowup, index = 4, conf = 0.95, type = 'bca')
ci_r_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 1)
##
## Intervals :
## Level BCa
## 95% ( 0.0359, 0.0732 )
## Calculations and Intervals on Original Scale
ci_rsq_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 2)
##
## Intervals :
## Level BCa
## 95% ( 0.0322, 0.0812 )
## Calculations and Intervals on Original Scale
ci_mae_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 3)
##
## Intervals :
## Level BCa
## 95% (-0.0331, -0.0096 )
## Calculations and Intervals on Original Scale
ci_rmse_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 4)
##
## Intervals :
## Level BCa
## 95% (-0.0411, -0.0164 )
## Calculations and Intervals on Original Scale
r_plot_baseline <- as_tibble(DiffResultsBaseline$t[,1]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("Pearson's r") +
geom_vline(aes(xintercept=ci_r_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_r_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rsq_plot_baseline <- as_tibble(DiffResultsBaseline$t[,2]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("R squared") +
geom_vline(aes(xintercept=ci_rsq_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rsq_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
mae_plot_baseline <- as_tibble(DiffResultsBaseline$t[,3]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("MAE") +
geom_vline(aes(xintercept=ci_mae_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_mae_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rmse_plot_baseline <- as_tibble(DiffResultsBaseline$t[,4]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("RMSE") +
geom_vline(aes(xintercept=ci_rmse_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rmse_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
boot_baseline <-
ggpubr::ggarrange(plotlist=
list(r_plot_baseline,
rsq_plot_baseline,
mae_plot_baseline,
rmse_plot_baseline)) %>%
ggpubr::annotate_figure(top = ggpubr::text_grob("Baseline Test Set",
size=22,
face = "bold"))
r_plot_followup <- as_tibble(DiffResultsfollowup$t[,1]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("Pearson's r") +
geom_vline(aes(xintercept=ci_r_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_r_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rsq_plot_followup <- as_tibble(DiffResultsfollowup$t[,2]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("R squared") +
geom_vline(aes(xintercept=ci_rsq_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rsq_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
mae_plot_followup <- as_tibble(DiffResultsfollowup$t[,3]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("MAE") +
geom_vline(aes(xintercept=ci_mae_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_mae_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rmse_plot_followup <- as_tibble(DiffResultsfollowup$t[,4]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("RMSE") +
geom_vline(aes(xintercept=ci_rmse_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rmse_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
boot_followup <- ggpubr::ggarrange(plotlist=
list(r_plot_followup, rsq_plot_followup, mae_plot_followup, rmse_plot_followup)) %>%
ggpubr::annotate_figure(top = ggpubr::text_grob("Follow-up Test Set",
size=22,
face = "bold"))
ggpubr::ggarrange(boot_baseline, boot_followup,
nrow=2) %>%
ggpubr::annotate_figure(top =
ggpubr::text_grob("Bootstrapped Distribution Stacked Best > N-Back",
size=24,
face = "bold"))
To test if the model generalise across sites, we used the baseline data, fit the model to all but one site and test the results at hold-out site.
After leaving one site as a test, we 50/50 divided the rest into train (i.e,. 1st layer) and validate (i.e., 2nd layer). This process then was repeated until we finished all of held-out sites.
baseline_site <- baseline_analysis %>% drop_na(SITE_ID_L)
unique_site <- length(unique(baseline_site$SITE_ID_L))
site_vfold <- group_vfold_cv(data = baseline_site, group = "SITE_ID_L", v= unique_site)
## the following two lines take the split results and transform it into a data frame
## so that all the results can be checked.
#model_data_1 <- site_vfold$splits[[1]]%>% analysis()
#holdout__data_1 <- site_vfold$splits[[1]] %>% assessment()
all_sample <- site_vfold$id
all_site <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) %>%
do.call(rbind,.)
all_site <- as.vector(all_site)
all_site_number <- str_remove_all(all_site,"site")
###divide the sampled data into train valid and test sets
#after leave one site, the rest was devided into 2: train and validate
train_valid_split <- all_sample %>%
map(.,~vfold_cv(data=analysis(site_vfold$splits[[which(site_vfold$id==.)]]),
v=2,repeats = 1))
names(train_valid_split)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
test_site_split <- all_sample %>%
map(.,~assessment(site_vfold$splits[[which(site_vfold$id==.)]]))
names(test_site_split)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
Note, for cross site validation, we only focus on the baseline data. We didn’t use the followup data here.
enet_site_nocom <-
function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test,
contrast_name){
data_train <- split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
ols_data_train <-
select(data_train,
resp_var,
starts_with(contrast_name))
ols_data_valid <-
select(data_valid,
resp_var,
starts_with(contrast_name))
ols_test_set <- select(test_set,
resp_var,
starts_with(contrast_name))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
enet_site_com <- function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test,
measure_one,
measure_two){
data_train <-split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
ols_data_train <- batch_adjust(data_fold = data_train,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_data_valid <- batch_adjust(data_fold = data_valid,
resp = resp_var,
x=measure_one,
y= measure_two)
ols_test_set <- select(test_set,
resp_var,
starts_with(measure_one),
starts_with(measure_two) )
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_site_comOne <- function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test, measure_one){
data_train <-split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>% IQR_remove()
ols_data_train <- batch_adjustOne(data_fold = data_train,
resp = resp_var, x=measure_one)
ols_data_valid <- batch_adjustOne(data_fold = data_valid,
resp = resp_var, x=measure_one)
ols_test_set <- select(test_set,
resp_var,
starts_with(measure_one))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
enet_tuning_site <-
function(resp_var, data_list){
#recipe train and test data are from ols as the variables are the same
norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")),
data =data_list[["ols_data_train"]] ) %>%
step_dummy(all_nominal()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
# estimate the means and standard deviations
prep(training = data_list[["ols_data_train"]], retain = TRUE)
## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>%
set_engine("glmnet")
set.seed(123456)
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],v = 10,repeats = 1)
glmn_wfl <- workflow()%>%
add_recipe(norm_recipe)%>%
add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 100 levels of lambda and 11 levels of alpha (0, .1 to 1) # use 100 level as opposed to 200 to reduce the use ofram
glmn_grid <- grid_regular(glmn_set, levels = c(100,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <-tune::tune_grid(glmn_wfl,
resamples = all_rs,
grid = glmn_grid,
metrics = metric_set(mae),##use traditional r square because there is no correlation
control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tunes parameters to fit the test data
glmn_wfl_final <-
glmn_wfl %>%
finalize_workflow(best_glmn) %>%
parsnip::fit(data = data_list[["ols_data_train"]])
### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
###this is the prediction for test set
pred_test <- predict(glmn_wfl_final, new_data =data_list[["ols_test_set"]])
##compute the correlation between predicted results and the test data
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_test <- cor(pred_test$.pred,data_list[["ols_test_set"]][[resp_var]])
best_glmn <- best_glmn %>% mutate(performance=performance_glmn,test_performance = performance_test)
return(list("param_tune"=best_glmn,
"predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]],
"pred"=pred_glmn$.pred),
"predicted_test"=tibble(SUBJECTKEY=data_list[["test_set_id"]],
"pred"=pred_test$.pred),
"obs_train"=length(data_list[["data_train_id"]]),
"obs_valid"=length(data_list[["data_valid_id"]])))
}
rnd_for_data_site = function(pred_select = "predicted",
data_list,site_idx,
resp_chose,
resp_data){
enet_pred <-
plyr::join_all(list(data_list[["MID_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("MID_pred"="pred"),
data_list[["smri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("smri_pred"="pred"),
data_list[["SST_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("SST_pred"="pred"),
data_list[["rsmri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("rsmri_pred"="pred"),
data_list[["Nback_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("Nback_pred"="pred"),
data_list[["DTI_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("DTI_pred"="pred")),
by='SUBJECTKEY', type='full') %>%
distinct(SUBJECTKEY, .keep_all = TRUE)
enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
enet_pred_noNA <- left_join(enet_pred_p,
enet_pred_n,
by="SUBJECTKEY")
resp_data <- resp_data %>%
select("SUBJECTKEY",resp_chose)%>%
distinct(SUBJECTKEY, .keep_all = TRUE) %>%
drop_na()%>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}
random_forest_site <- function(resp,valid_list,test_list,site_chose){
### to avoid repetition of coding a list is created with one data frame with ID and another without
rf_train_data <- valid_list[[resp]][[site_chose]]
rf_test_data <- test_list[[resp]][[site_chose]]
resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), data = rf_train_data[["NoID"]]) %>%
step_dummy(all_nominal()) %>%
prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(forest_rec) %>%
add_model(tune_spec)
set.seed(123456)
## nested data validation
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
rf_grid <- grid_regular(
mtry(range = c(1, 12)), #12 is the highest number of predictors
min_n(range = c(1, 500)), #500 is enough based on earlier analyses
levels = c(mtry = 12, min_n = 51) #mtry = 10 so that it increases one step at a time
)
## dofuture package is imported by package furrr
doParallel::registerDoParallel()
future::plan(multisession,workers= all_cores)
tune_res <- tune_grid(
tune_wf,
resamples = all_rs,
metrics = metric_set(rmse),
grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")
final_rf <- finalize_model(
tune_spec,
best_tune
)
forest_prep <- prep(forest_rec)
## this one fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
set_engine("ranger") %>%
parsnip::fit(as.formula(paste0(resp_var, "~ .")),
data = juice(forest_prep)
)
pred_rf <- predict(rf_fit,new_data = rf_test_data[["NoID"]])
performance_rf <- cor(pred_rf$.pred,rf_test_data[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance=performance_rf)
result_list <- list("param_tune"=best_tune, "predicted"=tibble(SUBJECTKEY=rf_test_data[["ID"]]$SUBJECTKEY, "pred"=pred_rf$.pred),"model_fit"=rf_fit)
return(result_list)
}
change_modality_names <- function(performance_frame){
performance_frame$modality[which(performance_frame$modality=="rsmri")]<- "rs_fMRI"
performance_frame$modality[which(performance_frame$modality=="smri")]<- "sMRI"
performance_frame$modality[which(performance_frame$modality=="random_forest")]<- "Stacked"
performance_frame$modality[which(performance_frame$modality=="Nback")]<- "NBack"
performance_frame <- performance_frame%>%
mutate(modality= as.factor(modality))%>%
mutate(modality = factor(modality,levels =c ("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")))
}
CFA_lav_response_site <- function(data_input = baseline_two_fold$splits[[1]], test_baseline =baseline_test){
## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for baseline observations
data_baseline <- test_baseline%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model
NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")
### second order CFA model
NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'
NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")
train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,CFA_model1 = NeuroCog1stOrder.Fit,CFA_model2 = NeuroCog2ndOrder.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,CFA_model1 = NeuroCog1stOrder.Fit,CFA_model2 = NeuroCog2ndOrder.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,CFA_model1 = NeuroCog1stOrder.Fit,CFA_model2 = NeuroCog2ndOrder.Fit)
output_list <- list(enet_tuning = train_CFA_lav, RanFor_tuning = valid_CFA_lav, testing_baseline = baseline_CFA_lav)
return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit))
}
CFA_list_site <- all_site %>% map(.,~CFA_lav_response_site(data_input = train_valid_split[[.]][["splits"]][[1]], test_baseline =test_site_split[[.]]))
names(CFA_list_site) <- all_site
CFA_resp_site <- c("g_second_order" )
Nback_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
Nback_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "X2backvs0back_ROI_"))
names(Nback_site_data) <- all_sample %>%
future_map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L),
.options = furrr_options(seed=TRUE),
.progress = TRUE)
Nback_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = Nback_site_data[[.]]))
names(Nback_site)<- all_sample %>%
map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(Nback_site)
}
SST_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
SST_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "anystopvscorrectgo_ROI_"))
names(SST_site_data) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
SST_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = SST_site_data[[.]]))
names(SST_site)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(SST_site)
}
MID_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
MID_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train =left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "antiRewVsNeu_ROI_"))
names(MID_site_data)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
MID_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = MID_site_data[[.]]))
names(MID_site) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(MID_site)
}
smri_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
smri_site_data <- all_site %>%
map(.,~enet_site_com(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
names(smri_site_data)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
smri_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = smri_site_data[[.]]))
names(smri_site)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(smri_site)
}
rsmri_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
rsmri_site_data <- all_site %>%
map(.,~enet_site_com(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "par",
measure_two = "Within"))
names(rsmri_site_data) <- all_sample %>%
map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
rsmri_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = rsmri_site_data[[.]]))
names(rsmri_site) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(rsmri_site)
}
DTI_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
DTI_site_data <- all_site %>%
map(.,~enet_site_comOne(resp_var = resp_chosen,
split_train =
left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "FA_"))
names(DTI_site_data) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
DTI_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = DTI_site_data[[.]]))
names(DTI_site)<- all_sample %>% map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(DTI_site)
}
Nback_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~Nback_cross_site_CFA(resp_chosen =.))
SST_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~SST_cross_site_CFA(resp_chosen =.))
MID_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~MID_cross_site_CFA(resp_chosen =.))
smri_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~smri_cross_site_CFA(resp_chosen =.))
rsmri_enet_cross_site_CFA <- CFA_resp_site%>% map(.,~rsmri_cross_site_CFA(resp_chosen =.))
DTI_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~DTI_cross_site_CFA(resp_chosen =.))
rf_enet_cross_site_CFA <- list("Nback_site"=Nback_enet_cross_site_CFA,
"SST_site"=SST_enet_cross_site_CFA,
"MID_site"=MID_enet_cross_site_CFA,
"smri_site"=smri_enet_cross_site_CFA,
"rsmri_site"=rsmri_enet_cross_site_CFA,
"DTI_site"=DTI_enet_cross_site_CFA)
all_cores <- parallel::detectCores(logical = FALSE) - 2
mri_site <- paste0(mri_vec,"_site")
for(i in 1:length(mri_site)){
names(rf_enet_cross_site_CFA[[mri_site[i]]])<- CFA_resp_site
}
random_forest_validsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <- all_site %>%map(.,
~rnd_for_data_site(pred_select = "predicted",
data_list=rf_enet_cross_site_CFA,site_idx = .,
resp_chose=resp_idx,
resp_data=CFA_list_site[[.]][[1]][["RanFor_tuning"]]) )
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(resp_chose)` instead of `resp_chose` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
names(random_forest_validsite_CFA) <- CFA_resp_site
random_forest_testsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <- all_site %>%
map(.,~rnd_for_data_site(pred_select = "predicted_test",
data_list=rf_enet_cross_site_CFA,
site_idx = .,
resp_chose=resp_idx,
resp_data=CFA_list_site[[.]][[1]][["testing_baseline"]]))
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
names(random_forest_testsite_CFA) <- CFA_resp_site
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
### this computation takes more than 10 cores and around 50 gb ram
random_forest_crosssite_CFA <- CFA_resp_site %>% future_map(.,function(resp_idx=.){
random_forest_oneresp <-all_site %>% future_map(.,~random_forest_site(resp = resp_idx,
valid_list = random_forest_validsite_CFA,
test_list = random_forest_testsite_CFA,site_chose = .),
.options = furrr_options(seed=TRUE))
names(random_forest_oneresp) <- all_site
return(random_forest_oneresp)
},.options = furrr_options(seed=TRUE))
### enet
modality_site_CFA <-names(rf_enet_cross_site_CFA) %>% str_remove_all("_site")
names(rf_enet_cross_site_CFA)<- modality_site_CFA
cross_site_enet_performance_CFA = function(resp_vec,data_input,modality_input,target_data){
data_split <- data_input[[modality_input]]
one_modality_extract <- resp_vec %>%
map(.,function(resp_input=.){
performance_extract <- all_site %>%
map(.,function(site_idx){
performance_site <-
data_split[[resp_input]][[site_idx]][["param_tune"]][["test_performance"]]
mae_frame <- left_join(data_split[[resp_input]][[site_idx]][["predicted_test"]],
target_data[[site_idx]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
select("pred",all_of(resp_input) ,"SUBJECTKEY") %>%
distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
mae_frame <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_input]])) )
## the data has to be referenced like this to be put into this function
mae_perform <- yardstick::mae(data = mae_frame,
truth=scaled_target,
estimate=pred)
rmse_perform <- yardstick::rmse(data = mae_frame,
truth=scaled_target,
estimate=pred)
tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,
truth=scaled_target,
estimate=pred)
combined_results <- tibble(performance = as.numeric(performance_site),
site=site_idx,
modality=modality_input,
response=resp_input,
mae_performance=mae_perform$.estimate,
rmse_performance=rmse_perform$.estimate,
tradRsq_performance=tradRsq_perform$.estimate
)
return(combined_results)
})%>%
bind_rows()
})
}
performance_enet_cross_site_CFA <- modality_vec %>%
map(.,~cross_site_enet_performance_CFA(resp_vec = CFA_resp_site,
data_input = rf_enet_cross_site_CFA,
modality_input = .,target_data =CFA_list_site))
names(performance_enet_cross_site_CFA) <- modality_vec
for(i in 1: length(modality_vec)){
names(performance_enet_cross_site_CFA[[modality_vec[i]]])<- CFA_resp_site
}
response_enet_cross_site_CFA <- CFA_resp_site %>%
map(.,function(resp_input=.,
data_input=performance_enet_cross_site_CFA){
one_resp_performance <- modality_site_CFA %>%
map(.,function(modality_input=.){
one_resp_modality <- data_input[[modality_input]][[resp_input]]
return(one_resp_modality)
})%>% bind_rows()
one_resp_performance_test<- one_resp_performance
return(one_resp_performance_test)
})
names(response_enet_cross_site_CFA) <- CFA_resp_site
### random forest
names(random_forest_crosssite_CFA)<- CFA_resp_site
cross_site_rf_performance_CFA = function(resp_idx,data_input=random_forest_crosssite,
plotting_frame=resp_var_plotting,target_data=test_site_split){
site_performance <- all_site %>% map(.,function(site_idx){
one_site <- data_input[[resp_idx]][[site_idx]][["param_tune"]][["performance"]]
mae_frame <- left_join(data_input[[resp_idx]][[site_idx]][["predicted"]],
target_data[[site_idx]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>% select("pred",all_of(resp_idx) ,"SUBJECTKEY") %>%
distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
mae_frame <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_idx]])) )
## the data has to be referenced like this to be put into this function
mae_perform <- yardstick::mae(data = mae_frame,truth=scaled_target,estimate=pred)
rmse_perform <- yardstick::rmse(data = mae_frame,truth=scaled_target,estimate=pred)
tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,truth=scaled_target,estimate=pred)
combined_results <- tibble(performance = as.numeric(one_site),
site=site_idx,
modality="random_forest",
response=resp_idx,
longer_name = plotting_frame$longer_name[which(plotting_frame$response==response)],
shorter_name = plotting_frame$short_name[which(plotting_frame$response==response[1])],
mae_performance=mae_perform$.estimate,
rmse_performance=rmse_perform$.estimate,
tradRsq_performance=tradRsq_perform$.estimate)
return(combined_results)
})%>% bind_rows()
}
performance_rf_cross_site_CFA <- CFA_resp_site %>%
map(.,~cross_site_rf_performance_CFA(resp_idx=.,
data_input = random_forest_crosssite_CFA,
plotting_frame = CFA_var_plotting,
target_data =CFA_list_site))
names(performance_rf_cross_site_CFA)<- CFA_resp_site
performance_all_modalities_CFA <- CFA_resp_site %>%
map(.,~bind_rows(performance_rf_cross_site_CFA[[.]],
response_enet_cross_site_CFA[[.]])%>%
mutate(site_num=str_remove(.[['site']],"site") ) )
names(performance_all_modalities_CFA)<- CFA_resp_site
#modality_ploting <- c("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")
performance_all_modalities_CFA <- performance_all_modalities_CFA %>%
map(.,~change_modality_names(performance_frame = .))
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (Pearson's r)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (MAE)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (RMSE)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (R squared)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
### getting summary statistics
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% map(.,~mutate(.,Pearson_r =performance)%>%
mutate(.,R_squared = tradRsq_performance)%>%
mutate(.,MAE=mae_performance)%>%
mutate(.,RMSE=rmse_performance))
performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
mean,na.rm=TRUE)%>%
mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>%
kableExtra::kbl(caption = paste0("Mean of all modalities cross sites ",
paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order
| modality | Pearson_r | R_squared | MAE | RMSE |
|---|---|---|---|---|
| Stacked | 0.460 | 0.210 | 0.698 | 0.888 |
| NBack | 0.408 | 0.167 | 0.718 | 0.910 |
| MID | 0.227 | 0.050 | 0.772 | 0.973 |
| SST | 0.139 | 0.019 | 0.783 | 0.988 |
| rs_fMRI | 0.255 | 0.064 | 0.765 | 0.966 |
| sMRI | 0.248 | 0.061 | 0.763 | 0.967 |
| DTI | 0.223 | 0.049 | 0.766 | 0.974 |
performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
sd,na.rm=TRUE)%>%
mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>%
kableExtra::kbl(caption = paste0("Standard deviation of all modalities cross sites ",
paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order
| modality | Pearson_r | R_squared | MAE | RMSE |
|---|---|---|---|---|
| Stacked | 0.057 | 0.052 | 0.023 | 0.029 |
| NBack | 0.069 | 0.055 | 0.028 | 0.031 |
| MID | 0.096 | 0.050 | 0.021 | 0.025 |
| SST | 0.071 | 0.024 | 0.014 | 0.012 |
| rs_fMRI | 0.061 | 0.030 | 0.015 | 0.016 |
| sMRI | 0.092 | 0.046 | 0.024 | 0.024 |
| DTI | 0.076 | 0.037 | 0.016 | 0.019 |
corr_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("Pearson's r"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
mae_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("MAE"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
rmse_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("RMSE"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
rsq_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("R squared"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
title_combined_plot <- ggdraw() +
draw_label(
"Out-of-site Predictive Ability",
fontface = 'bold',
x = 0,
hjust = 0,
size=24
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 5)
)
plot_combined_list <- list (corr_plot= corr_plot[[1]],
rsq_plot = rsq_plot[[1]],
mae_plot = mae_plot[[1]],
rmse_plot=rmse_plot[[1]])
combined_plot <- ggpubr::ggarrange(title_combined_plot,
ggpubr::ggarrange(plotlist=plot_combined_list,
common.legend = TRUE,
legend = "right"),
nrow = 2 , heights = c(0.1, 1))
## Warning: Removed 3 rows containing missing values.
## Warning: Removed 3 rows containing missing values.
## Warning: Removed 3 rows containing missing values.
## Warning: Removed 3 rows containing missing values.
## Warning: Removed 3 rows containing missing values.
ggpubr::annotate_figure(combined_plot,bottom = ggpubr::text_grob("Site",size=15))